114 lines
3.6 KiB
JavaScript
114 lines
3.6 KiB
JavaScript
import fs from "node:fs"
|
|
import path from "node:path"
|
|
import { extractRowsFromSQLValues } from "./utils.js"
|
|
|
|
// Absolute path of the `dump` directory inside the current working directory;
// the `page.sql` input file is looked up here.
const SQL_DUMP_PATH = path.resolve("dump")

// Absolute path of the `sql` directory inside the current working directory;
// the generated SQL file is written here.
const SQL_OUTPUT_PATH = path.resolve("sql")
|
|
|
|
/**
 * @typedef {Record<string, number>} WikipediaPages
 *
 * Object to store pages from Wikipedia:
 * - Key: sanitized page title. The title actually displayed is this title with underscores (_) converted to spaces ( ).
 * - Value: page id.
 */
|
|
|
|
/**
 * Function to clean the `page.sql` file by:
 * - Skipping everything that precedes the first line starting with "INSERT INTO `page` VALUES ".
 * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
 * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
 *
 * The file is read as a UTF-8 stream and parsed chunk by chunk; the text that
 * `extractRowsFromSQLValues` reports as `unCompleted` is carried over to the next chunk.
 * Progress is logged to the console each time at least one more percent of the file has been read.
 *
 * The returned promise rejects when:
 * - the file contains no line starting with "INSERT INTO `page` VALUES ",
 * - a parsed row does not have exactly 12 values,
 * - the stream emits an error (and `fs.promises.stat` failures are thrown as-is).
 *
 * @returns {Promise<WikipediaPages>}
 */
const cleanPagesSQL = async () => {
  /** @type {WikipediaPages} */
  const wikipediaPages = {}

  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  let current = ""
  let lastPercent = 0

  /**
   * Locates the first occurrence of the INSERT prefix that sits at the start of a line.
   *
   * @param {string} text Text whose first character is itself at the start of a line.
   * @returns {number} Index of the prefix inside `text`, or -1 when it is absent.
   */
  const findInsertStart = (text) => {
    if (text.startsWith(INSERT_INTO_START_INPUT)) {
      return 0
    }
    const newlineIndex = text.indexOf(`\n${INSERT_INTO_START_INPUT}`)
    return newlineIndex === -1 ? -1 : newlineIndex + 1
  }

  return await new Promise((resolve, reject) => {
    sqlInputFileStream
      .on("data", (dataInput) => {
        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
        const bytesReadPercent = bytesReadRatio * 100

        if (bytesReadPercent - lastPercent >= 1) {
          console.log(
            `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
          )
          lastPercent = bytesReadPercent
        }

        let data = current + dataInput

        if (!isInsideInsert) {
          const insertStart = findInsertStart(data)
          if (insertStart === -1) {
            // The statement may begin in a later chunk, or its prefix may be split
            // across two chunks: keep only the trailing (possibly partial) line so
            // the next chunk can complete it, and drop the fully-read header lines.
            current = data.slice(data.lastIndexOf("\n") + 1)
            return
          }
          isInsideInsert = true
          // Keep everything after the prefix (not just the first line), so rows that
          // follow the first INSERT line within this same chunk are not lost. Later
          // chunks are handed to the parser unfiltered in the same way.
          data = data.slice(insertStart + INSERT_INTO_START_INPUT.length)
        }

        const { rows, unCompleted } = extractRowsFromSQLValues(data)
        current = unCompleted

        for (const row of rows) {
          if (row.length !== 12) {
            sqlInputFileStream.close()
            console.error([row])
            return reject(new Error(`Invalid Row values.`))
          }

          const id = Number.parseInt(row[0] ?? "0", 10)
          const namespace = row[1] ?? ""
          const title = row[2] ?? ""
          const isRedirect = row[3] === "1"

          if (namespace === "0" && !isRedirect) {
            wikipediaPages[title] = id
          }
        }
      })
      .on("error", (error) => {
        return reject(error)
      })
      .on("close", () => {
        // "close" also fires after an earlier rejection; settling the promise a
        // second time is a no-op, so no extra guard is needed here.
        if (!isInsideInsert) {
          return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
        }
        return resolve(wikipediaPages)
      })
  })
}
|
|
|
|
// Parse the dump, then serialize every kept page into one multi-row INSERT statement.
const wikipediaPages = await cleanPagesSQL()

const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "

// NOTE(review): titles are emitted exactly as stored in `wikipediaPages`; this
// presumably relies on them already being SQL-quoted/escaped by the parser — confirm.
// NOTE(review): when no page was kept, the file below contains
// `INSERT INTO pages VALUES ;`, which has no row tuple.
const wikipediaPagesString = Object.entries(wikipediaPages)
  .map(([title, id]) => {
    return `(${id},${title})`
  })
  .join(",")

// Create the output directory when it is missing so the write below does not
// fail with ENOENT; with `recursive: true` an existing directory is not an error.
await fs.promises.mkdir(SQL_OUTPUT_PATH, { recursive: true })

await fs.promises.writeFile(
  sqlOutputPath,
  `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
  { encoding: "utf-8" },
)
|