import fs from "node:fs" import path from "node:path" import { extractRowsFromSQLValues, swapKeysAndValues, zeroPad, } from "./utils.js" const SQL_DUMP_PATH = path.join(process.cwd(), "dump") const SQL_FILENAME_NUMBER_PAD = 4 /** * @typedef {Record} WikipediaPagesKeyTitle * * Object to store pages from Wikipedia: * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ). * - Value: page id. */ /** * @typedef {Record} WikipediaPagesKeyId * * Object to store pages from Wikipedia: * - Key: page id. * - Value: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ). */ /** * @typedef WikipediaInternalLink * @property {number} fromPageId * @property {number} toPageId */ /** * Function to clean the `page.sql` file by: * - Removing all lines that don't start with `INSERT INTO...`. * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0). * - Only keep columns `page_id` (1st column) and `page_title` (3rd column). * @returns {Promise} */ const cleanPagesSQL = async () => { /** @type {WikipediaPagesKeyId} */ const wikipediaPagesKeyId = {} const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES " const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql") const sqlInputStat = await fs.promises.stat(sqlInputPath) const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8") let isInsideInsert = false let current = "" let lastPercent = 0 let pagesFileCount = 1 const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES " const BATCH_SIZE = 1_000_000 /** * @type {string[]} */ let batch = [] const flushBatch = async () => { if (batch.length > 0) { const batchString = batch.join(",") const fileName = `${zeroPad(pagesFileCount, SQL_FILENAME_NUMBER_PAD)}-pages-inserts.sql` const sqlOutputPath = path.join( process.cwd(), "sql-pages-inserts", fileName, ) await fs.promises.writeFile( sqlOutputPath, `${INSERT_INTO_START_OUTPUT}${batchString};`, { encoding: "utf-8", }, ) console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`) pagesFileCount += 1 batch = [] } } return await new Promise((resolve, reject) => { sqlInputFileStream .on("data", async (dataInput) => { const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size const bytesReadPercent = bytesReadRatio * 100 if (bytesReadPercent - lastPercent >= 1) { console.log( `cleanPagesSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, ) lastPercent = bytesReadPercent } let data = current + dataInput if (!isInsideInsert) { const lines = data.split("\n").filter((line) => { return line.startsWith(INSERT_INTO_START_INPUT) }) const [line] = lines if (line == null) { sqlInputFileStream.close() return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`)) } isInsideInsert = true const lineStripped = line.slice(INSERT_INTO_START_INPUT.length) data = lineStripped } const { rows, unCompleted } = extractRowsFromSQLValues(data) current = unCompleted for (const row of rows) { if (row.length !== 12) { sqlInputFileStream.close() console.error([row]) return reject(new Error(`Invalid Row values.`)) } const id = Number.parseInt(row[0] ?? "0", 10) const namespace = row[1] ?? "" const title = row[2] ?? "" const isRedirect = row[3] === "1" if (namespace === "0" && !isRedirect) { wikipediaPagesKeyId[id] = title batch.push(`(${id},E${title})`) } } if (batch.length >= BATCH_SIZE) { sqlInputFileStream.pause() await flushBatch() sqlInputFileStream.resume() } }) .on("error", (error) => { return reject(error) }) .on("close", async () => { await flushBatch() console.log("cleanPagesSQL - Bytes read (100%).") return resolve(wikipediaPagesKeyId) }) }) } const wikipediaPagesKeyId = await cleanPagesSQL() /** * Function to clean the `pagelinks.sql` file by: * - Removing all lines that don't start with `INSERT INTO...`. * - Filter by keeping rows where `pl_from_namespace` (2nd column) is equal to 0. * - Transform the rows to internal links with fromPageId and toPageId. * @returns {Promise} */ const cleanInternalLinksSQL = async () => { let internalLinksFileCount = 1 const INSERT_INTO_START_OUTPUT = "INSERT INTO internal_links (from_page_id, to_page_id) VALUES " /** * @type {WikipediaPagesKeyTitle} */ const wikipediaPagesKeyTitle = swapKeysAndValues(wikipediaPagesKeyId) const INSERT_INTO_START_INPUT = "INSERT INTO `pagelinks` VALUES " const sqlInputPath = path.join(SQL_DUMP_PATH, "pagelinks.sql") const sqlInputStat = await fs.promises.stat(sqlInputPath) const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8") let isInsideInsert = false let current = "" let lastPercent = 0 const BATCH_SIZE = 4_000_000 /** * @type {string[]} */ let batch = [] const flushBatch = async () => { if (batch.length > 0) { const batchString = batch.join(",") const fileName = `${zeroPad(internalLinksFileCount, SQL_FILENAME_NUMBER_PAD)}-internal-links-inserts.sql` const sqlOutputPath = path.join( process.cwd(), "sql-internal-links-inserts", fileName, ) await fs.promises.writeFile( sqlOutputPath, `${INSERT_INTO_START_OUTPUT}${batchString};`, { encoding: "utf-8", }, ) console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`) internalLinksFileCount += 1 batch = [] } } return await new Promise((resolve, reject) => { sqlInputFileStream .on("data", async (dataInput) => { const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size const bytesReadPercent = bytesReadRatio * 100 if (bytesReadPercent - lastPercent >= 0.5) { console.log( `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, ) lastPercent = bytesReadPercent } let data = current + dataInput if (!isInsideInsert) { const lines = data.split("\n").filter((line) => { return line.startsWith(INSERT_INTO_START_INPUT) }) const [line] = lines if (line == null) { sqlInputFileStream.close() return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`)) } isInsideInsert = true const lineStripped = line.slice(INSERT_INTO_START_INPUT.length) data = lineStripped } const { rows, unCompleted } = extractRowsFromSQLValues(data) current = unCompleted for (const row of rows) { if (row.length !== 5) { sqlInputFileStream.close() console.error([row]) return reject(new Error(`Invalid Row values.`)) } const plFromPageId = Number.parseInt(row[0] ?? "0", 10) const plTargetNamespace = row[1] ?? "" const plTargetTitle = row[2] ?? "" const plFromNamespace = row[3] ?? "" if (plFromNamespace === "0" && plTargetNamespace === "0") { const toPageId = wikipediaPagesKeyTitle[plTargetTitle] if (toPageId != null && wikipediaPagesKeyId[plFromPageId] != null) { /** * @type {WikipediaInternalLink} */ const wikipediaInternalLink = { fromPageId: plFromPageId, toPageId, } batch.push( `(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`, ) } } } if (batch.length >= BATCH_SIZE) { sqlInputFileStream.pause() await flushBatch() sqlInputFileStream.resume() } }) .on("error", (error) => { return reject(error) }) .on("close", async () => { await flushBatch() console.log("cleanInternalLinksSQL - Bytes read (100%).") return resolve() }) }) } await cleanInternalLinksSQL()