import fs from "node:fs"
import path from "node:path"

import { extractRowsFromSQLValues, swapKeysAndValues } from "./utils.js"

const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")

/**
 * @typedef {Record<string, number>} WikipediaPagesKeyTitle
 *
 * Object to store pages from Wikipedia:
 * - Key: sanitized page title - the real title shown is this title with underscores (`_`) converted to spaces (` `).
 * - Value: page id.
 */

/**
 * @typedef {Record<number, string>} WikipediaPagesKeyId
 *
 * Object to store pages from Wikipedia:
 * - Key: page id.
 * - Value: sanitized page title - the real title shown is this title with underscores (`_`) converted to spaces (` `).
 */

/**
 * @typedef WikipediaInternalLink
 * @property {number} fromPageId
 * @property {number} toPageId
 */

/**
 * Function to clean the `page.sql` file by:
 * - Removing all lines that don't start with `INSERT INTO...`.
 * - Keeping only rows where `page_namespace` (2nd column) is equal to 0 and `page_is_redirect` (4th column) is equal to false (0).
 * - Keeping only the columns `page_id` (1st column) and `page_title` (3rd column).
 * @returns {Promise<WikipediaPagesKeyId>}
 */
const cleanPagesSQL = async () => {
  /** @type {WikipediaPagesKeyId} */
  const wikipediaPagesKeyId = {}

  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  let current = ""
  let lastPercent = 0

  return await new Promise((resolve, reject) => {
    sqlInputFileStream
      .on("data", (dataInput) => {
        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
        const bytesReadPercent = bytesReadRatio * 100

        if (bytesReadPercent - lastPercent >= 1) {
          console.log(
            `cleanPagesSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
          )
          lastPercent = bytesReadPercent
        }

        let data = current + dataInput

        if (!isInsideInsert) {
          const lines = data.split("\n").filter((line) => {
            return line.startsWith(INSERT_INTO_START_INPUT)
          })
          const [line] = lines
          if (line == null) {
            sqlInputFileStream.close()
            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
          }
          isInsideInsert = true
          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
          data = lineStripped
        }

        const { rows, unCompleted } = extractRowsFromSQLValues(data)
        current = unCompleted

        for (const row of rows) {
          if (row.length !== 12) {
            sqlInputFileStream.close()
            console.error([row])
            return reject(new Error(`Invalid Row values.`))
          }

          const id = Number.parseInt(row[0] ?? "0", 10)
          const namespace = row[1] ?? ""
          const title = row[2] ?? ""
          const isRedirect = row[3] === "1"

          if (namespace === "0" && !isRedirect) {
            wikipediaPagesKeyId[id] = title
          }
        }
      })
      .on("error", (error) => {
        return reject(error)
      })
      .on("close", () => {
        console.log("cleanPagesSQL - Bytes read (100%).")
        return resolve(wikipediaPagesKeyId)
      })
  })
}
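/*
 * Note on the streaming parse above (an assumed contract for `./utils.js`, not a
 * documented one): `extractRowsFromSQLValues` is expected to take a string of SQL
 * tuples such as
 *   "(10,0,'Some_Title',...),(12,0,'Another_Title',...),(25,0,'Trunc"
 * and return `{ rows, unCompleted }`, where `rows` contains the column values of
 * every complete `(...)` tuple and `unCompleted` is the trailing partial tuple
 * ("(25,0,'Trunc" in this example). Carrying that tail over in `current` and
 * prepending it to the next chunk is what keeps the parser correct across chunk
 * boundaries.
 */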
"" const isRedirect = row[3] === "1" if (namespace === "0" && !isRedirect) { wikipediaPagesKeyId[id] = title } } }) .on("error", (error) => { return reject(error) }) .on("close", () => { console.log("cleanPagesSQL - Bytes read (100%).") return resolve(wikipediaPagesKeyId) }) }) } const wikipediaPagesKeyId = await cleanPagesSQL() const cleanPagesSQLWriteToFile = async () => { console.log("cleanPagesSQLWriteToFile - Writing to file...") const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql") const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES " const wikipediaPagesString = Object.entries(wikipediaPagesKeyId) .map(([id, title]) => { return `(${id},${title})` }) .join(",") await fs.promises.writeFile( sqlOutputPath, `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`, { encoding: "utf-8" }, ) console.log("cleanPagesSQLWriteToFile - Done.") } await cleanPagesSQLWriteToFile() const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "3-internal-links-inserts.sql") const INSERT_INTO_START_OUTPUT = "INSERT INTO internal_links (from_page_id, to_page_id) VALUES " const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w") await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8") /** * Function to clean the `pagelinks.sql` file by: * - Removing all lines that don't start with `INSERT INTO...`. * - Filter by keeping rows where `pl_from_namespace` (2nd column) is equal to 0. * - Transform the rows to internal links with fromPageId and toPageId. * @returns {Promise} */ const cleanInternalLinksSQL = async () => { /** * @type {WikipediaPagesKeyTitle} */ const wikipediaPagesKeyTitle = swapKeysAndValues(wikipediaPagesKeyId) const INSERT_INTO_START_INPUT = "INSERT INTO `pagelinks` VALUES " const sqlInputPath = path.join(SQL_DUMP_PATH, "pagelinks.sql") const sqlInputStat = await fs.promises.stat(sqlInputPath) const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8") let isInsideInsert = false let current = "" let lastPercent = 0 const BATCH_SIZE = 10_000 /** * @type {string[]} */ let batch = [] const flushBatch = async (isLast = false) => { if (batch.length > 0) { const batchString = batch.join(",") + (isLast ? ";" : ",") await sqlOutputFile.appendFile(batchString, "utf-8") batch = [] } } return await new Promise((resolve, reject) => { sqlInputFileStream .on("data", async (dataInput) => { const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size const bytesReadPercent = bytesReadRatio * 100 if (bytesReadPercent - lastPercent >= 0.5) { console.log( `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, ) lastPercent = bytesReadPercent } let data = current + dataInput if (!isInsideInsert) { const lines = data.split("\n").filter((line) => { return line.startsWith(INSERT_INTO_START_INPUT) }) const [line] = lines if (line == null) { sqlInputFileStream.close() return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`)) } isInsideInsert = true const lineStripped = line.slice(INSERT_INTO_START_INPUT.length) data = lineStripped } const { rows, unCompleted } = extractRowsFromSQLValues(data) current = unCompleted for (const row of rows) { if (row.length !== 5) { sqlInputFileStream.close() console.error([row]) return reject(new Error(`Invalid Row values.`)) } const plFromPageId = Number.parseInt(row[0] ?? "0", 10) const plTargetNamespace = row[1] ?? "" const plTargetTitle = row[2] ?? "" const plFromNamespace = row[3] ?? 
"" if (plFromNamespace === "0" && plTargetNamespace === "0") { const toPageId = wikipediaPagesKeyTitle[plTargetTitle] if (toPageId != null) { /** * @type {WikipediaInternalLink} */ const wikipediaInternalLink = { fromPageId: plFromPageId, toPageId, } batch.push( `(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`, ) if (batch.length >= BATCH_SIZE) { await flushBatch() } } } } }) .on("error", (error) => { return reject(error) }) .on("close", async () => { await flushBatch(true) console.log("cleanInternalLinksSQL - Bytes read (100%).") return resolve() }) }) } await cleanInternalLinksSQL()