import fs from "node:fs"
import path from "node:path"

import {
  extractRowsFromSQLValues,
  swapKeysAndValues,
  zeroPad,
} from "./utils.js"

const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_FILENAME_NUMBER_PAD = 4
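
// With SQL_FILENAME_NUMBER_PAD = 4, the generated files are named like
// "0001-pages-inserts.sql", "0002-pages-inserts.sql", … (see flushBatch below).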

/**
 * @typedef {Record<string, number>} WikipediaPagesKeyTitle
 *
 * Object to store pages from Wikipedia:
 * - Key: sanitized page title - the real title shown is this one with underscores (_) converted to spaces ( ).
 * - Value: page id.
 */

/**
 * @typedef {Record<string, string>} WikipediaPagesKeyId
 *
 * Object to store pages from Wikipedia:
 * - Key: page id.
 * - Value: sanitized page title - the real title shown is this one with underscores (_) converted to spaces ( ).
 */

/**
 * @typedef WikipediaInternalLink
 * @property {number} fromPageId
 * @property {number} toPageId
 */
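
// Illustrative shapes (hypothetical id and title; the exact quoting of stored
// titles depends on extractRowsFromSQLValues):
//   wikipediaPagesKeyId:    { "123": "Albert_Einstein" }
//   wikipediaPagesKeyTitle: { "Albert_Einstein": 123 }
// swapKeysAndValues turns the former into the latter (used by cleanInternalLinksSQL).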

/**
 * Function to clean the `page.sql` dump by:
 * - Removing all lines that don't start with `INSERT INTO...`.
 * - Keeping only rows where `page_namespace` (2nd column) equals 0 and `page_is_redirect` (4th column) is false (0).
 * - Keeping only the columns `page_id` (1st column) and `page_title` (3rd column).
 * @returns {Promise<WikipediaPagesKeyId>}
 */
const cleanPagesSQL = async () => {
  /** @type {WikipediaPagesKeyId} */
  const wikipediaPagesKeyId = {}

  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  // Parser state: whether we are inside an INSERT statement's VALUES list, the
  // unparsed tail of the previous chunk, and the last progress percent logged.
  let isInsideInsert = false
  let current = ""
  let lastPercent = 0

  let pagesFileCount = 1

  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES "

  // Tuples are buffered and flushed 1,000,000 at a time so each generated
  // `.sql` file stays a manageable size.
  const BATCH_SIZE = 1_000_000

  /**
   * @type {string[]}
   */
  let batch = []

  /**
   * Writes the current batch of `(id, title)` tuples to a numbered `.sql`
   * file in `sql-pages-inserts/`, then resets the batch.
   */
  const flushBatch = async () => {
    if (batch.length > 0) {
      const batchString = batch.join(",")
      const fileName = `${zeroPad(pagesFileCount, SQL_FILENAME_NUMBER_PAD)}-pages-inserts.sql`
      const sqlOutputPath = path.join(
        process.cwd(),
        "sql-pages-inserts",
        fileName,
      )
      await fs.promises.writeFile(
        sqlOutputPath,
        `${INSERT_INTO_START_OUTPUT}${batchString};`,
        {
          encoding: "utf-8",
        },
      )
      console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`)
      pagesFileCount += 1
      batch = []
    }
  }

  return await new Promise((resolve, reject) => {
    sqlInputFileStream
      .on("data", async (dataInput) => {
        // Log progress roughly every 1% of bytes read.
        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
        const bytesReadPercent = bytesReadRatio * 100

        if (bytesReadPercent - lastPercent >= 1) {
          console.log(
            `cleanPagesSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
          )
          lastPercent = bytesReadPercent
        }

        let data = current + dataInput

        // Skip everything before the first "INSERT INTO `page` VALUES " line,
        // then parse from the start of its VALUES list.
        if (!isInsideInsert) {
          const lines = data.split("\n").filter((line) => {
            return line.startsWith(INSERT_INTO_START_INPUT)
          })
          const [line] = lines
          if (line == null) {
            sqlInputFileStream.close()
            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
          }
          isInsideInsert = true
          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
          data = lineStripped
        }

        // Extract complete `(…)` rows; keep the incomplete tail for the next chunk.
        const { rows, unCompleted } = extractRowsFromSQLValues(data)
        current = unCompleted

        for (const row of rows) {
          // Expect 12 columns per `page` row in the dump.
          if (row.length !== 12) {
            sqlInputFileStream.close()
            console.error([row])
            return reject(new Error(`Invalid Row values.`))
          }

          const id = Number.parseInt(row[0] ?? "0", 10)
          const namespace = row[1] ?? ""
          const title = row[2] ?? ""
          const isRedirect = row[3] === "1"

          if (namespace === "0" && !isRedirect) {
            wikipediaPagesKeyId[id] = title
            // `title` is assumed to keep its single quotes from the dump, so
            // the `E` prefix forms a PostgreSQL `E'...'` escaped string literal.
            batch.push(`(${id},E${title})`)
          }
        }

        // Pause the stream while flushing so new "data" events don't pile up
        // behind the asynchronous file write.
        if (batch.length >= BATCH_SIZE) {
          sqlInputFileStream.pause()
          await flushBatch()
          sqlInputFileStream.resume()
        }
      })
      .on("error", (error) => {
        return reject(error)
      })
      .on("close", async () => {
        // Write whatever is left in the batch before resolving.
        await flushBatch()
        console.log("cleanPagesSQL - Bytes read (100%).")
        return resolve(wikipediaPagesKeyId)
      })
  })
}

const wikipediaPagesKeyId = await cleanPagesSQL()
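
// The whole id -> title map is kept in memory: cleanInternalLinksSQL below
// needs it (swapped) to resolve link target titles back to page ids.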

/**
 * Function to clean the `pagelinks.sql` dump by:
 * - Removing all lines that don't start with `INSERT INTO...`.
 * - Keeping only rows where `pl_from_namespace` (4th column) and the target namespace (2nd column) equal 0.
 * - Transforming the rows into internal links with `fromPageId` and `toPageId`.
 * @returns {Promise<void>}
 */
const cleanInternalLinksSQL = async () => {
  let internalLinksFileCount = 1

  const INSERT_INTO_START_OUTPUT =
    "INSERT INTO internal_links (from_page_id, to_page_id) VALUES "

  /**
   * @type {WikipediaPagesKeyTitle}
   */
  const wikipediaPagesKeyTitle = swapKeysAndValues(wikipediaPagesKeyId)

  const INSERT_INTO_START_INPUT = "INSERT INTO `pagelinks` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "pagelinks.sql")
  const sqlInputStat = await fs.promises.stat(sqlInputPath)
  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

  let isInsideInsert = false
  let current = ""
  let lastPercent = 0

  // Each link tuple is just two integers, so a larger batch than for pages is
  // used per output file.
  const BATCH_SIZE = 4_000_000

  /**
   * @type {string[]}
   */
  let batch = []

  const flushBatch = async () => {
    if (batch.length > 0) {
      const batchString = batch.join(",")
      const fileName = `${zeroPad(internalLinksFileCount, SQL_FILENAME_NUMBER_PAD)}-internal-links-inserts.sql`
      const sqlOutputPath = path.join(
        process.cwd(),
        "sql-internal-links-inserts",
        fileName,
      )
      await fs.promises.writeFile(
        sqlOutputPath,
        `${INSERT_INTO_START_OUTPUT}${batchString};`,
        {
          encoding: "utf-8",
        },
      )
      console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`)
      internalLinksFileCount += 1
      batch = []
    }
  }

  return await new Promise((resolve, reject) => {
    sqlInputFileStream
      .on("data", async (dataInput) => {
        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
        const bytesReadPercent = bytesReadRatio * 100

        if (bytesReadPercent - lastPercent >= 0.5) {
          console.log(
            `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
          )
          lastPercent = bytesReadPercent
        }

        let data = current + dataInput

        if (!isInsideInsert) {
          const lines = data.split("\n").filter((line) => {
            return line.startsWith(INSERT_INTO_START_INPUT)
          })
          const [line] = lines
          if (line == null) {
            sqlInputFileStream.close()
            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
          }
          isInsideInsert = true
          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
          data = lineStripped
        }

        const { rows, unCompleted } = extractRowsFromSQLValues(data)
        current = unCompleted

        for (const row of rows) {
          // Expect 5 columns per `pagelinks` row in the dump.
          if (row.length !== 5) {
            sqlInputFileStream.close()
            console.error([row])
            return reject(new Error(`Invalid Row values.`))
          }

          const plFromPageId = Number.parseInt(row[0] ?? "0", 10)
          const plTargetNamespace = row[1] ?? ""
          const plTargetTitle = row[2] ?? ""
          const plFromNamespace = row[3] ?? ""

          if (plFromNamespace === "0" && plTargetNamespace === "0") {
            // Resolve the target title back to a page id; drop links whose
            // source or target page was filtered out of `wikipediaPagesKeyId`.
            const toPageId = wikipediaPagesKeyTitle[plTargetTitle]
            if (toPageId != null && wikipediaPagesKeyId[plFromPageId] != null) {
              /**
               * @type {WikipediaInternalLink}
               */
              const wikipediaInternalLink = {
                fromPageId: plFromPageId,
                toPageId,
              }
              batch.push(
                `(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`,
              )
            }
          }
        }

        if (batch.length >= BATCH_SIZE) {
          sqlInputFileStream.pause()
          await flushBatch()
          sqlInputFileStream.resume()
        }
      })
      .on("error", (error) => {
        return reject(error)
      })
      .on("close", async () => {
        await flushBatch()
        console.log("cleanInternalLinksSQL - Bytes read (100%).")
        return resolve()
      })
  })
}

await cleanInternalLinksSQL()
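
// The generated `.sql` files can then be loaded into the target database. The
// `E'...'` string literals suggest PostgreSQL, so a hedged example (database
// name and pre-existing `pages` / `internal_links` tables are assumptions):
//   psql --dbname=wikipedia --file=sql-pages-inserts/0001-pages-inserts.sql
//   psql --dbname=wikipedia --file=sql-internal-links-inserts/0001-internal-links-inserts.sql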