import fs from "node:fs"
import path from "node:path"
import { extractRowsFromSQLValues } from "./utils.js"

const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")

/**
 * @typedef {Record<string, number>} WikipediaPages
 *
 * Object to store pages from Wikipedia:
 * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
 * - Value: page id.
 */

/**
 * Locate the first line of `text` that starts with `prefix`.
 *
 * `text` is expected to begin at a line start, so a match is either at
 * offset 0 or immediately after a `\n`.
 *
 * @param {string} text - Buffered text to search.
 * @param {string} prefix - Line prefix to look for.
 * @returns {number} Offset of the matching line's first character, or -1 if no line starts with `prefix`.
 */
const findLineStart = (text, prefix) => {
  if (text.startsWith(prefix)) {
    return 0
  }
  const newlineIndex = text.indexOf(`\n${prefix}`)
  return newlineIndex === -1 ? -1 : newlineIndex + 1
}

/**
 * Function to clean the `page.sql` file by:
 * - Skipping everything before the first line starting with `INSERT INTO...`.
 * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
 * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
 *
 * The promise rejects when the stream errors, when a row does not have
 * exactly 12 columns, or when a non-empty file never contains the
 * `INSERT INTO` prefix at the start of a line.
 *
 * @returns {Promise<WikipediaPages>}
 */
const cleanPagesSQL = async () => {
  // Null-prototype dictionary: keys come from external data, so an entry
  // such as `__proto__` must be stored like any other key.
  /** @type {WikipediaPages} */
  const wikipediaPages = Object.create(null)
  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  // Text carried over from the previous chunk: either the trailing partial
  // line (before the first INSERT is found) or the incomplete row returned
  // by `extractRowsFromSQLValues`.
  let current = ""
  let lastPercent = 0

  return await new Promise((resolve, reject) => {
    sqlInputFileStream
      .on("data", (dataInput) => {
        // Log progress each time at least one more percent has been read.
        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
        const bytesReadPercent = bytesReadRatio * 100
        if (bytesReadPercent - lastPercent >= 1) {
          console.log(
            `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
          )
          lastPercent = bytesReadPercent
        }

        let data = current + dataInput
        if (!isInsideInsert) {
          const insertStart = findLineStart(data, INSERT_INTO_START_INPUT)
          if (insertStart === -1) {
            // Chunk boundaries are arbitrary: the prefix may be split across
            // two chunks, or the header may be longer than one chunk. Keep
            // only the trailing partial line and wait for more data.
            current = data.slice(data.lastIndexOf("\n") + 1)
            return
          }
          isInsideInsert = true
          // Keep everything after the prefix (not just the first line), so
          // the remainder of this chunk is parsed like any later chunk.
          data = data.slice(insertStart + INSERT_INTO_START_INPUT.length)
        }

        const { rows, unCompleted } = extractRowsFromSQLValues(data)
        current = unCompleted

        for (const row of rows) {
          if (row.length !== 12) {
            sqlInputFileStream.close()
            console.error([row])
            return reject(new Error("Invalid Row values."))
          }
          const id = Number.parseInt(row[0] ?? "0", 10)
          const namespace = row[1] ?? ""
          const title = row[2] ?? ""
          const isRedirect = row[3] === "1"
          if (namespace === "0" && !isRedirect) {
            wikipediaPages[title] = id
          }
        }
      })
      .on("error", (error) => {
        return reject(error)
      })
      .on("close", () => {
        // Settling twice is harmless: if the promise was already rejected
        // above, this call is ignored. An empty file (0 bytes read) still
        // resolves to an empty dictionary.
        if (!isInsideInsert && sqlInputFileStream.bytesRead > 0) {
          return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
        }
        return resolve(wikipediaPages)
      })
  })
}

const wikipediaPages = await cleanPagesSQL()

/**
 * Write every collected page as a single `INSERT INTO pages VALUES ...;`
 * statement to `2-pages-inserts.sql` inside the SQL output directory.
 *
 * @returns {Promise<void>}
 */
const cleanPagesSQLWriteToFile = async () => {
  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "

  // NOTE(review): `title` is emitted exactly as returned by
  // `extractRowsFromSQLValues` — presumably still SQL-quoted; confirm.
  const wikipediaPagesString = Object.entries(wikipediaPages)
    .map(([title, id]) => {
      return `(${id},${title})`
    })
    .join(",")

  // Make sure the output directory exists so the write cannot fail with
  // ENOENT after the whole dump has been parsed.
  await fs.promises.mkdir(SQL_OUTPUT_PATH, { recursive: true })
  await fs.promises.writeFile(
    sqlOutputPath,
    `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
    { encoding: "utf-8" },
  )
}

await cleanPagesSQLWriteToFile()