wikipedia-game-solver/data/database-wikipedia-v2.js

114 lines
3.6 KiB
JavaScript
Raw Normal View History

import fs from "node:fs"
import path from "node:path"
import { extractRowsFromSQLValues } from "./utils.js"
// Input and output directories, both resolved against the current working
// directory (`path.resolve` anchors a relative segment to `process.cwd()`).
const SQL_DUMP_PATH = path.resolve("dump")
const SQL_OUTPUT_PATH = path.resolve("sql")
/**
* @typedef {Record<string, number>} WikipediaPages
*
* Object storing pages from Wikipedia:
* - Key: sanitized page title. The title displayed to readers is this title with underscores (_) replaced by spaces ( ).
* - Value: page id.
*/
/**
 * Streams the `page.sql` dump and collects the pages worth keeping:
 * - Everything before the first line that starts with the `page` table's `INSERT INTO` prefix is skipped.
 * - A row is kept only when `page_namespace` (2nd column) is `0` and `page_is_redirect` (4th column) is not `1`.
 * - From each kept row only `page_id` (1st column) and `page_title` (3rd column) are stored.
 *
 * Progress is printed to the console each time at least one more percent of the file has been read.
 * The promise rejects when the first chunk processed contains no `INSERT INTO` line,
 * when a row does not hold exactly 12 values, or when the stream emits an error.
 * @returns {Promise<WikipediaPages>}
 */
const cleanPagesSQL = async () => {
  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "

  /** @type {WikipediaPages} */
  const pagesByTitle = {}

  const inputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const { size: totalBytes } = await fs.promises.stat(inputPath)
  const inputStream = fs.createReadStream(inputPath, "utf-8")

  // Parser state carried from one chunk to the next.
  let insertReached = false
  let pending = ""
  let loggedPercent = 0

  // Prints how far into the file the stream is, at most once per percent.
  const logProgress = () => {
    const percent = (inputStream.bytesRead / totalBytes) * 100
    if (percent - loggedPercent >= 1) {
      console.log(
        `Bytes read (${percent.toFixed(2)}%): ${inputStream.bytesRead} / ${totalBytes}`,
      )
      loggedPercent = percent
    }
  }

  return await new Promise((resolve, reject) => {
    const handleChunk = (chunk) => {
      logProgress()

      // Prepend whatever the previous chunk left unparsed.
      let text = pending + chunk

      if (!insertReached) {
        // Locate the first INSERT statement and drop its prefix so that only
        // the values list is handed to the row extractor.
        const insertLine = text.split("\n").find((candidate) => {
          return candidate.startsWith(INSERT_INTO_START_INPUT)
        })
        if (insertLine == null) {
          inputStream.close()
          reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
          return
        }
        insertReached = true
        text = insertLine.slice(INSERT_INTO_START_INPUT.length)
      }

      const { rows, unCompleted } = extractRowsFromSQLValues(text)
      pending = unCompleted

      for (const row of rows) {
        if (row.length !== 12) {
          inputStream.close()
          console.error([row])
          reject(new Error("Invalid Row values."))
          return
        }

        // Skip rows outside namespace 0 and rows flagged as redirects.
        const namespace = row[1] ?? ""
        if (namespace !== "0" || row[3] === "1") {
          continue
        }

        const title = row[2] ?? ""
        pagesByTitle[title] = Number.parseInt(row[0] ?? "0", 10)
      }
    }

    inputStream.on("data", handleChunk)
    inputStream.on("error", reject)
    // "close" also fires after an early `close()` above; resolving then is a
    // no-op because the promise has already been rejected.
    inputStream.on("close", () => {
      resolve(pagesByTitle)
    })
  })
}
// Collect the pages from the dump, then serialise them as one bulk INSERT
// statement made of `(id,title)` tuples.
const wikipediaPages = await cleanPagesSQL()
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
const wikipediaPagesString = Array.from(
  Object.entries(wikipediaPages),
  ([title, id]) => `(${id},${title})`,
).join(",")
await fs.promises.writeFile(
  sqlOutputPath,
  INSERT_INTO_START_OUTPUT + wikipediaPagesString + ";",
  "utf-8",
)
// Unused alternative kept for reference (appending through a file handle
// instead of a single write):
// const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
// await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
// await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")