import fs from "node:fs"
import path from "node:path"
import { extractRowsFromSQLValues } from "./utils.js"

const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")

/**
 * Function to clean the `page.sql` file by:
 * - Skipping everything that comes before the first line starting with `INSERT INTO...`.
 * - Keeping only rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
 * - Only keeping columns `page_id` (1st column) and `page_title` (3rd column).
 *
 * Reads `dump/page.sql` and writes a single `INSERT INTO pages VALUES ...;` statement
 * to `sql/2-pages-inserts.sql` (both relative to the current working directory).
 *
 * The input is consumed chunk by chunk with `for await`, so a chunk is fully written
 * before the next one is parsed and the closing `;` is always the last thing written.
 *
 * @returns {Promise<void>} Resolves once the output file is completely written and closed.
 * @throws {Error} If no `INSERT INTO` line is found, if a row does not have 12 values,
 * or if reading/writing fails.
 */
const cleanPagesSQL = async () => {
  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES\n"
  const EXPECTED_VALUES_PER_ROW = 12
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")

  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")

  let isInsideInsert = false
  let current = ""
  let lastPercent = 0
  let hasWrittenRow = false

  try {
    await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8")

    // Created right before the loop so that the iterator's error handling is
    // attached before the stream can emit anything.
    const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

    for await (const dataInput of sqlInputFileStream) {
      const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
      const bytesReadPercent = bytesReadRatio * 100
      if (bytesReadPercent - lastPercent >= 1) {
        console.log(
          `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
        )
        lastPercent = bytesReadPercent
      }

      /**
       * @type {string}
       */
      let data = current + dataInput

      if (!isInsideInsert) {
        // `data` always begins at the start of a line here, so the prefix is
        // either at index 0 or directly after a line break.
        let insertIndex = -1
        if (data.startsWith(INSERT_INTO_START_INPUT)) {
          insertIndex = 0
        } else {
          const lineBreakIndex = data.indexOf(`\n${INSERT_INTO_START_INPUT}`)
          if (lineBreakIndex !== -1) {
            insertIndex = lineBreakIndex + 1
          }
        }

        if (insertIndex === -1) {
          // Keep only the trailing (possibly incomplete) line: the prefix may
          // be split across two chunks.
          current = data.slice(data.lastIndexOf("\n") + 1)
          continue
        }

        isInsideInsert = true
        data = data.slice(insertIndex + INSERT_INTO_START_INPUT.length)
      }

      const { rows, unCompleted } = extractRowsFromSQLValues(data)
      current = unCompleted

      const values = []
      for (const row of rows) {
        if (row.length !== EXPECTED_VALUES_PER_ROW) {
          console.error([row])
          throw new Error(`Invalid Row values.`)
        }

        const id = Number.parseInt(row[0] ?? "0", 10)
        const namespace = row[1] ?? ""
        const title = row[2] ?? ""
        const isRedirect = row[3] === "1"

        if (namespace === "0" && !isRedirect) {
          values.push(`(${id},${title})`)
        }
      }

      if (values.length > 0) {
        // The separator goes between rows (never after the last one), so the
        // statement does not end with a dangling comma before `;`.
        const separator = hasWrittenRow ? ",\n" : ""
        await sqlOutputFile.appendFile(
          `${separator}${values.join(",\n")}`,
          "utf-8",
        )
        hasWrittenRow = true
      }
    }

    if (!isInsideInsert) {
      throw new Error(`No "${INSERT_INTO_START_INPUT}" found.`)
    }

    await sqlOutputFile.appendFile(";\n", "utf-8")
    console.log(`Cleaned "${sqlInputPath}" to "${sqlOutputPath}".`)
  } finally {
    // Single place where the output handle is released, on success and on failure.
    await sqlOutputFile.close()
  }
}

await cleanPagesSQL()