wikipedia-game-solver/data/database-wikipedia.js

95 lines
3.2 KiB
JavaScript
Raw Normal View History

import fs from "node:fs"
import path from "node:path"
import { extractRowsFromSQLValues } from "./utils.js"
// Directory the raw SQL dump files are read from: `dump`, relative to the current working directory.
const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
// Directory the cleaned SQL files are written to: `sql`, relative to the current working directory.
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
/**
 * Cleans the `page.sql` dump and writes the result to `2-pages-inserts.sql`:
 * - Skips everything before the first line starting with "INSERT INTO `page` VALUES ".
 * - Keeps only rows where `page_namespace` (2nd column) is `0` and
 *   `page_is_redirect` (4th column) is not `1`.
 * - Keeps only the columns `page_id` (1st column) and `page_title` (3rd column).
 *
 * The input is consumed chunk by chunk with `for await`, so every chunk is fully
 * parsed and written before the next one is read. This keeps the output ordered
 * and guarantees the terminating `;` is written after the last row.
 *
 * Output shape: `INSERT INTO pages VALUES\n(id,title),\n(id,title);\n`
 * (tuples are separated by `,\n`; no comma is emitted after the last tuple).
 *
 * @returns {Promise<void>} Resolves once the output file is fully written and closed.
 * @throws {Error} If the first chunk contains no "INSERT INTO `page` VALUES " line,
 * if a parsed row does not have exactly 12 values, or if reading/writing fails.
 */
const cleanPagesSQL = async () => {
  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES\n"
  const EXPECTED_COLUMNS_COUNT = 12
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")

  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")

  let isInsideInsert = false
  // Trailing text of the previous chunk that did not form a complete row yet.
  let current = ""
  let lastPercent = 0
  let hasWrittenRow = false

  try {
    await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8")

    // Created right before the loop so no `await` happens between creating the
    // stream and attaching its consumers (an early "error" event would otherwise
    // be unhandled). Leaving the loop by throwing destroys the stream.
    const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

    for await (const dataInput of sqlInputFileStream) {
      const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
      const bytesReadPercent = bytesReadRatio * 100
      if (bytesReadPercent - lastPercent >= 1) {
        console.log(
          `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
        )
        lastPercent = bytesReadPercent
      }

      /**
       * @type {string}
       */
      let data = `${current}${dataInput}`

      if (!isInsideInsert) {
        // Drop the dump header: keep only what follows the first INSERT prefix.
        const line = data.split("\n").find((candidate) => {
          return candidate.startsWith(INSERT_INTO_START_INPUT)
        })
        if (line == null) {
          throw new Error(`No "${INSERT_INTO_START_INPUT}" found.`)
        }
        isInsideInsert = true
        data = line.slice(INSERT_INTO_START_INPUT.length)
      }

      // NOTE(review): `unCompleted` is presumably the unparsed tail of `data`
      // (a row cut by the chunk boundary) — confirm against `./utils.js`.
      const { rows, unCompleted } = extractRowsFromSQLValues(data)
      current = unCompleted

      const tuples = []
      for (const row of rows) {
        if (row.length !== EXPECTED_COLUMNS_COUNT) {
          console.error([row])
          throw new Error(`Invalid Row values.`)
        }
        const id = Number.parseInt(row[0] ?? "0", 10)
        const namespace = row[1] ?? ""
        const title = row[2] ?? ""
        const isRedirect = row[3] === "1"
        if (namespace === "0" && !isRedirect) {
          tuples.push(`(${id},${title})`)
        }
      }

      if (tuples.length > 0) {
        // One write per chunk; the separator goes *before* every tuple except
        // the very first one so the statement never ends with a dangling comma.
        const separator = hasWrittenRow ? ",\n" : ""
        await sqlOutputFile.appendFile(
          `${separator}${tuples.join(",\n")}`,
          "utf-8",
        )
        hasWrittenRow = true
      }
    }

    await sqlOutputFile.appendFile(";\n", "utf-8")
    console.log(`Cleaned "${sqlInputPath}" to "${sqlOutputPath}".`)
  } finally {
    // Always release the output handle, on success and on failure alike.
    await sqlOutputFile.close()
  }
}
// Script entry point: run the cleaning step when this module is evaluated (top-level await).
await cleanPagesSQL()