wikipedia-game-solver/data/database-wikipedia.js

255 lines
8.0 KiB
JavaScript
Raw Normal View History

import fs from "node:fs"
import path from "node:path"
import { extractRowsFromSQLValues, swapKeysAndValues } from "./utils.js"
// Directory the input dumps (`page.sql`, `pagelinks.sql`) are read from, resolved against the current working directory.
const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
// Directory the generated `*-inserts.sql` files are written to, resolved against the current working directory.
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
/**
* @typedef {Record<string, number>} WikipediaPagesKeyTitle
*
* Object to store pages from Wikipedia:
* - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
* - Value: page id.
*/
/**
 * @typedef {Record<number, string>} WikipediaPagesKeyId
 *
 * Object to store pages from Wikipedia:
 * - Key: page id.
 * - Value: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
 */
/**
* @typedef WikipediaInternalLink
* @property {number} fromPageId
* @property {number} toPageId
*/
/**
 * Function to clean the `page.sql` file by:
 * - Skipping everything before the first line that starts with `INSERT INTO \`page\` VALUES `.
 * - Keeping rows where `page_namespace` (2nd column) is equal to `0` and `page_is_redirect` (4th column) is not `1`.
 * - Only keeping columns `page_id` (1st column) and `page_title` (3rd column).
 *
 * The file is read as a stream; rows cut by a chunk boundary are carried over to the next chunk
 * through the `unCompleted` remainder returned by `extractRowsFromSQLValues`.
 *
 * @returns {Promise<WikipediaPagesKeyId>} Page titles keyed by page id.
 * @throws {Error} If the first chunk read contains no `INSERT INTO` line, or if a row does not have exactly 12 columns.
 */
const cleanPagesSQL = async () => {
  /** @type {WikipediaPagesKeyId} */
  const wikipediaPagesKeyId = {}
  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  let current = ""
  let lastPercent = 0

  // Leaving the loop with a `throw` destroys the stream, so no explicit close is needed on failure.
  for await (const dataInput of sqlInputFileStream) {
    const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
    const bytesReadPercent = bytesReadRatio * 100
    // Only log once at least one more percent of the file has been read.
    if (bytesReadPercent - lastPercent >= 1) {
      console.log(
        `cleanPagesSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
      )
      lastPercent = bytesReadPercent
    }

    let data = current + dataInput
    if (!isInsideInsert) {
      // Locate the first line starting with the INSERT prefix.
      let insertIndex = -1
      if (data.startsWith(INSERT_INTO_START_INPUT)) {
        insertIndex = 0
      } else {
        const newlineIndex = data.indexOf(`\n${INSERT_INTO_START_INPUT}`)
        if (newlineIndex !== -1) {
          insertIndex = newlineIndex + 1
        }
      }
      if (insertIndex === -1) {
        throw new Error(`No "${INSERT_INTO_START_INPUT}" found.`)
      }
      isInsideInsert = true
      // Keep the whole remainder of the chunk (not just the first INSERT line), so that
      // further INSERT lines in this chunk are parsed exactly like those of later chunks.
      data = data.slice(insertIndex + INSERT_INTO_START_INPUT.length)
    }

    const { rows, unCompleted } = extractRowsFromSQLValues(data)
    current = unCompleted

    for (const row of rows) {
      if (row.length !== 12) {
        console.error([row])
        throw new Error(`Invalid Row values.`)
      }
      const id = Number.parseInt(row[0] ?? "0", 10)
      const namespace = row[1] ?? ""
      const title = row[2] ?? ""
      const isRedirect = row[3] === "1"
      if (namespace === "0" && !isRedirect) {
        wikipediaPagesKeyId[id] = title
      }
    }
  }

  console.log("cleanPagesSQL - Bytes read (100%).")
  return wikipediaPagesKeyId
}
// Parse `page.sql` at module load (top-level await). The resulting id -> title map is read by
// `cleanPagesSQLWriteToFile` and `cleanInternalLinksSQL`.
const wikipediaPagesKeyId = await cleanPagesSQL()
/**
 * Writes every page of `wikipediaPagesKeyId` to `2-pages-inserts.sql` as a single
 * `INSERT INTO pages VALUES (id,title),(id,title),...;` statement.
 *
 * The statement is appended to the file in batches instead of being built as one string:
 * the output is byte-identical, but peak memory stays bounded and the result can not
 * exceed the engine's maximum string length.
 * If a write fails part-way, the file is left truncated.
 *
 * @returns {Promise<void>}
 */
const cleanPagesSQLWriteToFile = async () => {
  console.log("cleanPagesSQLWriteToFile - Writing to file...")
  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
  // Number of `(id,title)` tuples serialized per write.
  const BATCH_SIZE = 50000
  const writeOptions = { encoding: "utf-8" }
  const pageIds = Object.keys(wikipediaPagesKeyId)

  // `writeFile` truncates any previous output before the batches are appended.
  await fs.promises.writeFile(
    sqlOutputPath,
    INSERT_INTO_START_OUTPUT,
    writeOptions,
  )
  for (let start = 0; start < pageIds.length; start += BATCH_SIZE) {
    const tuples = pageIds
      .slice(start, start + BATCH_SIZE)
      .map((id) => {
        return `(${id},${wikipediaPagesKeyId[id]})`
      })
      .join(",")
    // Every batch but the first continues the same comma-separated list.
    const separator = start === 0 ? "" : ","
    await fs.promises.appendFile(
      sqlOutputPath,
      `${separator}${tuples}`,
      writeOptions,
    )
  }
  await fs.promises.appendFile(sqlOutputPath, ";", writeOptions)
  console.log("cleanPagesSQLWriteToFile - Done.")
}
// Write the pages insert file before the page links are parsed.
await cleanPagesSQLWriteToFile()
/**
 * Function to clean the `pagelinks.sql` file by:
 * - Skipping everything before the first line that starts with `INSERT INTO \`pagelinks\` VALUES `.
 * - Keeping rows where both the target namespace (2nd column) and `pl_from_namespace` (4th column) are equal to `0`.
 * - Keeping only links whose source page id (1st column) and target title (3rd column) are both
 *   present in `wikipediaPagesKeyId`.
 * - Transforming the rows to internal links with `fromPageId` and `toPageId`.
 *
 * The file is read as a stream; rows cut by a chunk boundary are carried over to the next chunk
 * through the `unCompleted` remainder returned by `extractRowsFromSQLValues`.
 *
 * @returns {Promise<WikipediaInternalLink[]>}
 * @throws {Error} If the first chunk read contains no `INSERT INTO` line, or if a row does not have exactly 5 columns.
 */
const cleanInternalLinksSQL = async () => {
  /**
   * @type {WikipediaInternalLink[]}
   */
  const internalLinks = []
  /**
   * Title -> id lookup, used to resolve the target title of each link.
   * @type {WikipediaPagesKeyTitle}
   */
  const wikipediaPagesKeyTitle = swapKeysAndValues(wikipediaPagesKeyId)
  const INSERT_INTO_START_INPUT = "INSERT INTO `pagelinks` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "pagelinks.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  let current = ""
  let lastPercent = 0

  // Leaving the loop with a `throw` destroys the stream, so no explicit close is needed on failure.
  for await (const dataInput of sqlInputFileStream) {
    const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
    const bytesReadPercent = bytesReadRatio * 100
    // Only log once at least one more percent of the file has been read.
    if (bytesReadPercent - lastPercent >= 1) {
      console.log(
        `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
      )
      lastPercent = bytesReadPercent
    }

    let data = current + dataInput
    if (!isInsideInsert) {
      // Locate the first line starting with the INSERT prefix.
      let insertIndex = -1
      if (data.startsWith(INSERT_INTO_START_INPUT)) {
        insertIndex = 0
      } else {
        const newlineIndex = data.indexOf(`\n${INSERT_INTO_START_INPUT}`)
        if (newlineIndex !== -1) {
          insertIndex = newlineIndex + 1
        }
      }
      if (insertIndex === -1) {
        throw new Error(`No "${INSERT_INTO_START_INPUT}" found.`)
      }
      isInsideInsert = true
      // Keep the whole remainder of the chunk (not just the first INSERT line), so that
      // further INSERT lines in this chunk are parsed exactly like those of later chunks.
      data = data.slice(insertIndex + INSERT_INTO_START_INPUT.length)
    }

    const { rows, unCompleted } = extractRowsFromSQLValues(data)
    current = unCompleted

    for (const row of rows) {
      if (row.length !== 5) {
        console.error([row])
        throw new Error(`Invalid Row values.`)
      }
      const plFromPageId = Number.parseInt(row[0] ?? "0", 10)
      const plTargetNamespace = row[1] ?? ""
      const plTargetTitle = row[2] ?? ""
      const plFromNamespace = row[3] ?? ""
      if (plFromNamespace !== "0" || plTargetNamespace !== "0") {
        continue
      }
      const toPageId = wikipediaPagesKeyTitle[plTargetTitle]
      // Drop links whose target or source is not one of the kept pages.
      if (toPageId == null || wikipediaPagesKeyId[plFromPageId] == null) {
        continue
      }
      /**
       * @type {WikipediaInternalLink}
       */
      const wikipediaInternalLink = {
        fromPageId: plFromPageId,
        toPageId,
      }
      internalLinks.push(wikipediaInternalLink)
    }
  }

  console.log("cleanInternalLinksSQL - Bytes read (100%).")
  return internalLinks
}
// Parse `pagelinks.sql` at module load (top-level await). The resulting list is read by
// `cleanInternalLinksSQLWriteToFile`.
const internalLinks = await cleanInternalLinksSQL()
/**
 * Writes every link of `internalLinks` to `3-internal-links-inserts.sql` as a single
 * `INSERT INTO internal_links VALUES (from,to),(from,to),...;` statement.
 *
 * The statement is appended to the file in batches instead of being built as one string:
 * the output is byte-identical, but peak memory stays bounded and the result can not
 * exceed the engine's maximum string length (joining every link at once throws a
 * `RangeError` once the list is large enough).
 * If a write fails part-way, the file is left truncated.
 *
 * @returns {Promise<void>}
 */
const cleanInternalLinksSQLWriteToFile = async () => {
  console.log("cleanInternalLinksSQLWriteToFile - Writing to file...")
  const sqlOutputPath = path.join(
    SQL_OUTPUT_PATH,
    "3-internal-links-inserts.sql",
  )
  const INSERT_INTO_START_OUTPUT = "INSERT INTO internal_links VALUES "
  // Number of `(from,to)` tuples serialized per write.
  const BATCH_SIZE = 50000
  const writeOptions = { encoding: "utf-8" }

  // `writeFile` truncates any previous output before the batches are appended.
  await fs.promises.writeFile(
    sqlOutputPath,
    INSERT_INTO_START_OUTPUT,
    writeOptions,
  )
  for (let start = 0; start < internalLinks.length; start += BATCH_SIZE) {
    const tuples = internalLinks
      .slice(start, start + BATCH_SIZE)
      .map(({ fromPageId, toPageId }) => {
        return `(${fromPageId},${toPageId})`
      })
      .join(",")
    // Every batch but the first continues the same comma-separated list.
    const separator = start === 0 ? "" : ","
    await fs.promises.appendFile(
      sqlOutputPath,
      `${separator}${tuples}`,
      writeOptions,
    )
  }
  await fs.promises.appendFile(sqlOutputPath, ";", writeOptions)
  console.log("cleanInternalLinksSQLWriteToFile - Done.")
}
// Final step: write the internal links insert file.
await cleanInternalLinksSQLWriteToFile()