This repository has been archived on 2024-10-12. You can view files and clone it, but cannot push or open issues or pull requests.
wikipedia-game-solver/data/generate-sql-files.js

290 lines
8.7 KiB
JavaScript
Raw Permalink Normal View History

import fs from "node:fs"
import path from "node:path"
import {
extractRowsFromSQLValues,
swapKeysAndValues,
zeroPad,
} from "./utils.js"
// Directory the raw dump files (`page.sql`, `pagelinks.sql`) are read from:
// the `dump` folder of the current working directory.
const SQL_DUMP_PATH = path.resolve("dump")

// Second argument given to `zeroPad` when numbering the generated SQL files.
const SQL_FILENAME_NUMBER_PAD = 4
/**
* @typedef {Record<string, number>} WikipediaPagesKeyTitle
*
* Object to store pages from Wikipedia:
* - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
* - Value: page id.
*/
/**
* @typedef {Record<string, string>} WikipediaPagesKeyId
*
* Object to store pages from Wikipedia:
* - Key: page id (used as an object key, so it is stored as a string).
* - Value: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
*/
/**
* @typedef WikipediaInternalLink
* @property {number} fromPageId
* @property {number} toPageId
*/
/**
 * Function to clean the `page.sql` file by:
 * - Removing all lines that don't start with `INSERT INTO...`.
 * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
 * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
 *
 * The kept rows are written as batched `INSERT INTO pages (id, title)` statements
 * to numbered files inside the `sql-pages-inserts` directory of the current
 * working directory (created when it does not exist yet).
 *
 * The returned promise rejects when the dump cannot be read, when the first
 * chunk contains no `INSERT INTO` line, when a row does not have exactly 12
 * columns, or when a batch cannot be written. After a failure the pending
 * batch is discarded instead of being flushed.
 * @returns {Promise<WikipediaPagesKeyId>} Kept pages, keyed by page id.
 */
const cleanPagesSQL = async () => {
  /** @type {WikipediaPagesKeyId} */
  const wikipediaPagesKeyId = {}

  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES "
  const BATCH_SIZE = 1_000_000
  const ROW_COLUMNS_COUNT = 12

  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlOutputDirectory = path.join(process.cwd(), "sql-pages-inserts")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  // Without this, the first flush fails with ENOENT when the directory is missing.
  await fs.promises.mkdir(sqlOutputDirectory, { recursive: true })
  // Keep every `await` above this line: the listeners below must be attached
  // in the same tick the stream is created.
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  // Tail of the previous chunk that did not yet form a complete row.
  let current = ""
  let lastPercent = 0
  let pagesFileCount = 1
  /**
   * @type {string[]}
   */
  let batch = []

  /**
   * Writes the pending batch (if any) as a single INSERT statement to the
   * next numbered file, then empties the batch.
   */
  const flushBatch = async () => {
    if (batch.length === 0) {
      return
    }
    const fileName = `${zeroPad(pagesFileCount, SQL_FILENAME_NUMBER_PAD)}-pages-inserts.sql`
    await fs.promises.writeFile(
      path.join(sqlOutputDirectory, fileName),
      `${INSERT_INTO_START_OUTPUT}${batch.join(",")};`,
      {
        encoding: "utf-8",
      },
    )
    console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`)
    pagesFileCount += 1
    batch = []
  }

  return await new Promise((resolve, reject) => {
    let hasFailed = false

    /**
     * Rejects the promise (first failure wins) and tears the stream down.
     * @param {unknown} error
     */
    const fail = (error) => {
      if (hasFailed) {
        return
      }
      hasFailed = true
      sqlInputFileStream.destroy()
      reject(error)
    }

    sqlInputFileStream
      .on("data", async (dataInput) => {
        // The emitter ignores the promise returned by an async listener, so
        // anything thrown here must be routed to `reject` explicitly.
        try {
          const bytesReadRatio =
            sqlInputFileStream.bytesRead / sqlInputStat.size
          const bytesReadPercent = bytesReadRatio * 100
          if (bytesReadPercent - lastPercent >= 1) {
            console.log(
              `cleanPagesSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
            )
            lastPercent = bytesReadPercent
          }

          let data = current + dataInput
          if (!isInsideInsert) {
            // Skip the dump header: start at the first INSERT line.
            const line = data.split("\n").find((candidate) => {
              return candidate.startsWith(INSERT_INTO_START_INPUT)
            })
            if (line == null) {
              return fail(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
            }
            isInsideInsert = true
            data = line.slice(INSERT_INTO_START_INPUT.length)
          }

          const { rows, unCompleted } = extractRowsFromSQLValues(data)
          current = unCompleted

          for (const row of rows) {
            if (row.length !== ROW_COLUMNS_COUNT) {
              console.error([row])
              return fail(new Error(`Invalid Row values.`))
            }
            const id = Number.parseInt(row[0] ?? "0", 10)
            const namespace = row[1] ?? ""
            const title = row[2] ?? ""
            const isRedirect = row[3] === "1"
            if (namespace === "0" && !isRedirect) {
              wikipediaPagesKeyId[id] = title
              // NOTE(review): the `E` prefix presumably turns the still-quoted
              // title into a PostgreSQL escape string - confirm against
              // `extractRowsFromSQLValues`.
              batch.push(`(${id},E${title})`)
            }
          }

          if (batch.length >= BATCH_SIZE) {
            // Stop receiving chunks while the batch is being written.
            sqlInputFileStream.pause()
            await flushBatch()
            if (!hasFailed) {
              sqlInputFileStream.resume()
            }
          }
        } catch (error) {
          fail(error)
        }
      })
      .on("error", (error) => {
        fail(error)
      })
      .on("close", async () => {
        // "close" also fires after a failure; never flush a partial result then.
        if (hasFailed) {
          return
        }
        try {
          await flushBatch()
          console.log("cleanPagesSQL - Bytes read (100%).")
          resolve(wikipediaPagesKeyId)
        } catch (error) {
          fail(error)
        }
      })
  })
}
// Run the pages pass first: `cleanInternalLinksSQL` reads this id -> title map
// to decide which links to keep.
const wikipediaPagesKeyId = await cleanPagesSQL()
/**
 * Function to clean the `pagelinks.sql` file by:
 * - Removing all lines that don't start with `INSERT INTO...`.
 * - Filter by keeping rows where the target namespace (2nd column) and `pl_from_namespace` (4th column) are both equal to 0.
 * - Only keep links whose source page id (1st column) and target title (3rd column) both belong to the pages collected in `wikipediaPagesKeyId`.
 * - Transform the rows to internal links with fromPageId and toPageId.
 *
 * The kept links are written as batched
 * `INSERT INTO internal_links (from_page_id, to_page_id)` statements to
 * numbered files inside the `sql-internal-links-inserts` directory of the
 * current working directory (created when it does not exist yet).
 *
 * The returned promise rejects when the dump cannot be read, when the first
 * chunk contains no `INSERT INTO` line, when a row does not have exactly 5
 * columns, or when a batch cannot be written. After a failure the pending
 * batch is discarded instead of being flushed.
 * @returns {Promise<void>}
 */
const cleanInternalLinksSQL = async () => {
  const INSERT_INTO_START_INPUT = "INSERT INTO `pagelinks` VALUES "
  const INSERT_INTO_START_OUTPUT =
    "INSERT INTO internal_links (from_page_id, to_page_id) VALUES "
  const BATCH_SIZE = 4_000_000
  const ROW_COLUMNS_COUNT = 5

  /**
   * Reverse lookup (title -> id) used to resolve link targets.
   * @type {WikipediaPagesKeyTitle}
   */
  const wikipediaPagesKeyTitle = swapKeysAndValues(wikipediaPagesKeyId)

  const sqlInputPath = path.join(SQL_DUMP_PATH, "pagelinks.sql")
  const sqlOutputDirectory = path.join(
    process.cwd(),
    "sql-internal-links-inserts",
  )
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  // Without this, the first flush fails with ENOENT when the directory is missing.
  await fs.promises.mkdir(sqlOutputDirectory, { recursive: true })
  // Keep every `await` above this line: the listeners below must be attached
  // in the same tick the stream is created.
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  // Tail of the previous chunk that did not yet form a complete row.
  let current = ""
  let lastPercent = 0
  let internalLinksFileCount = 1
  /**
   * @type {string[]}
   */
  let batch = []

  /**
   * Writes the pending batch (if any) as a single INSERT statement to the
   * next numbered file, then empties the batch.
   */
  const flushBatch = async () => {
    if (batch.length === 0) {
      return
    }
    const fileName = `${zeroPad(internalLinksFileCount, SQL_FILENAME_NUMBER_PAD)}-internal-links-inserts.sql`
    await fs.promises.writeFile(
      path.join(sqlOutputDirectory, fileName),
      `${INSERT_INTO_START_OUTPUT}${batch.join(",")};`,
      {
        encoding: "utf-8",
      },
    )
    console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`)
    internalLinksFileCount += 1
    batch = []
  }

  return await new Promise((resolve, reject) => {
    let hasFailed = false

    /**
     * Rejects the promise (first failure wins) and tears the stream down.
     * @param {unknown} error
     */
    const fail = (error) => {
      if (hasFailed) {
        return
      }
      hasFailed = true
      sqlInputFileStream.destroy()
      reject(error)
    }

    sqlInputFileStream
      .on("data", async (dataInput) => {
        // The emitter ignores the promise returned by an async listener, so
        // anything thrown here must be routed to `reject` explicitly.
        try {
          const bytesReadRatio =
            sqlInputFileStream.bytesRead / sqlInputStat.size
          const bytesReadPercent = bytesReadRatio * 100
          if (bytesReadPercent - lastPercent >= 0.5) {
            console.log(
              `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
            )
            lastPercent = bytesReadPercent
          }

          let data = current + dataInput
          if (!isInsideInsert) {
            // Skip the dump header: start at the first INSERT line.
            const line = data.split("\n").find((candidate) => {
              return candidate.startsWith(INSERT_INTO_START_INPUT)
            })
            if (line == null) {
              return fail(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
            }
            isInsideInsert = true
            data = line.slice(INSERT_INTO_START_INPUT.length)
          }

          const { rows, unCompleted } = extractRowsFromSQLValues(data)
          current = unCompleted

          for (const row of rows) {
            if (row.length !== ROW_COLUMNS_COUNT) {
              console.error([row])
              return fail(new Error(`Invalid Row values.`))
            }
            const plFromPageId = Number.parseInt(row[0] ?? "0", 10)
            const plTargetNamespace = row[1] ?? ""
            const plTargetTitle = row[2] ?? ""
            const plFromNamespace = row[3] ?? ""
            if (plFromNamespace !== "0" || plTargetNamespace !== "0") {
              continue
            }
            const toPageId = wikipediaPagesKeyTitle[plTargetTitle]
            // Both ends of the link must be pages kept by `cleanPagesSQL`.
            if (toPageId == null || wikipediaPagesKeyId[plFromPageId] == null) {
              continue
            }
            /**
             * @type {WikipediaInternalLink}
             */
            const wikipediaInternalLink = {
              fromPageId: plFromPageId,
              toPageId,
            }
            batch.push(
              `(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`,
            )
          }

          if (batch.length >= BATCH_SIZE) {
            // Stop receiving chunks while the batch is being written.
            sqlInputFileStream.pause()
            await flushBatch()
            if (!hasFailed) {
              sqlInputFileStream.resume()
            }
          }
        } catch (error) {
          fail(error)
        }
      })
      .on("error", (error) => {
        fail(error)
      })
      .on("close", async () => {
        // "close" also fires after a failure; never flush a partial result then.
        if (hasFailed) {
          return
        }
        try {
          await flushBatch()
          console.log("cleanInternalLinksSQL - Bytes read (100%).")
          resolve()
        } catch (error) {
          fail(error)
        }
      })
  })
}
// Second pass: needs `wikipediaPagesKeyId` to be fully populated before it runs.
await cleanInternalLinksSQL()