perf: improve memory usage for POC to get wikipedia dump

This commit is contained in:
Théo LUDWIG 2024-08-05 17:36:19 +02:00
parent fee0b4e681
commit 3bed3e0578
Signed by: theoludwig
GPG Key ID: ADFE5A563D718F3B
4 changed files with 48 additions and 46 deletions

View File

@ -11,7 +11,7 @@
- [x] Adapt downloaded SQL files
- [x] `page.sql` (`pages` tables)
- [x] `pagelinks.sql` (`internal_links` tables)
- [x] Import SQL files
- [ ] Import SQL files => Investigate why there is an error when importing
- [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
- [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder
- [ ] Document how to use it + last execution date

View File

@ -9,6 +9,8 @@ node --max-old-space-size=10096 database-wikipedia.js
Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
Show the first 10 characters of the SQL file: `head -c 10 ./dump/page.sql`
To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
## Remove a volume

View File

@ -111,7 +111,7 @@ const wikipediaPagesKeyId = await cleanPagesSQL()
const cleanPagesSQLWriteToFile = async () => {
console.log("cleanPagesSQLWriteToFile - Writing to file...")
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES "
const wikipediaPagesString = Object.entries(wikipediaPagesKeyId)
.map(([id, title]) => {
@ -129,19 +129,20 @@ const cleanPagesSQLWriteToFile = async () => {
await cleanPagesSQLWriteToFile()
// Destination file for the single `INSERT INTO internal_links ...` statement.
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "3-internal-links-inserts.sql")
// Prefix of the statement; the `(from_page_id,to_page_id)` value tuples are
// appended after it in batches while the input dump is being read.
const INSERT_INTO_START_OUTPUT =
"INSERT INTO internal_links (from_page_id, to_page_id) VALUES "
// Opened with the "w" flag, then kept open so that rows can be appended
// incrementally instead of building the whole statement in memory first.
const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
// Write the statement prefix once, before any row is appended.
await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8")
/**
* Function to clean the `pagelinks.sql` file by:
* - Removing all lines that don't start with `INSERT INTO...`.
* - Filter by keeping rows where `pl_from_namespace` (2nd column) is equal to 0.
* - Transform the rows to internal links with fromPageId and toPageId.
* @returns {Promise<WikipediaInternalLink[]>}
* @returns {Promise<void>}
*/
const cleanInternalLinksSQL = async () => {
/**
* @type {WikipediaInternalLink[]}
*/
const internalLinks = []
/**
* @type {WikipediaPagesKeyTitle}
*/
@ -156,13 +157,28 @@ const cleanInternalLinksSQL = async () => {
let current = ""
let lastPercent = 0
const BATCH_SIZE = 10_000
/**
* @type {string[]}
*/
let batch = []
/**
 * Whether at least one row has already been handed to the output file, i.e.
 * whether the next non-empty batch must be preceded by a `,` separator.
 */
let hasWrittenRows = false
/**
 * Tail of the chain of pending writes. Every flush is appended to it so that
 * writes reach the file one at a time and in call order.
 * @type {Promise<void>}
 */
let pendingWrite = Promise.resolve()
/**
 * Appends the rows currently buffered in `batch` to `sqlOutputFile`.
 *
 * The separator is written *before* each batch (except the first one) rather
 * than after it, so the output never ends with a dangling `,`, and the final
 * call always terminates the statement with `;`, even when the last batch is
 * empty.
 *
 * @param {boolean} [isLast=false] `true` for the final flush, which appends the closing `;`.
 * @returns {Promise<void>} Resolves once this flush (and every flush queued before it) is written.
 */
const flushBatch = async (isLast = false) => {
  // Detach the buffered rows synchronously, before any `await`: the caller
  // keeps pushing rows while the write is in flight, and those rows must land
  // in a fresh array instead of being dropped (or written twice).
  const rows = batch
  batch = []
  if (rows.length === 0 && !isLast) {
    return
  }
  let chunk = ""
  if (rows.length > 0) {
    chunk = `${hasWrittenRows ? "," : ""}${rows.join(",")}`
    hasWrittenRows = true
  }
  if (isLast) {
    chunk += ";"
  }
  // Serialize the writes: overlapping `appendFile` calls on the same file
  // handle are not guaranteed to be applied in order.
  pendingWrite = pendingWrite.then(() => {
    return sqlOutputFile.appendFile(chunk, "utf-8")
  })
  await pendingWrite
}
return await new Promise((resolve, reject) => {
sqlInputFileStream
.on("data", (dataInput) => {
.on("data", async (dataInput) => {
const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
const bytesReadPercent = bytesReadRatio * 100
if (bytesReadPercent - lastPercent >= 1) {
if (bytesReadPercent - lastPercent >= 0.5) {
console.log(
`cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
)
@ -201,18 +217,22 @@ const cleanInternalLinksSQL = async () => {
const plFromNamespace = row[3] ?? ""
if (plFromNamespace === "0" && plTargetNamespace === "0") {
if (
wikipediaPagesKeyTitle[plTargetTitle] != null &&
wikipediaPagesKeyId[plFromPageId] != null
) {
const toPageId = wikipediaPagesKeyTitle[plTargetTitle]
if (toPageId != null) {
/**
* @type {WikipediaInternalLink}
*/
const wikipediaInternalLink = {
fromPageId: plFromPageId,
toPageId: wikipediaPagesKeyTitle[plTargetTitle],
toPageId,
}
batch.push(
`(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`,
)
if (batch.length >= BATCH_SIZE) {
await flushBatch()
}
internalLinks.push(wikipediaInternalLink)
}
}
}
@ -220,35 +240,12 @@ const cleanInternalLinksSQL = async () => {
.on("error", (error) => {
return reject(error)
})
.on("close", () => {
.on("close", async () => {
await flushBatch(true)
console.log("cleanInternalLinksSQL - Bytes read (100%).")
return resolve(internalLinks)
return resolve()
})
})
}
const internalLinks = await cleanInternalLinksSQL()
/**
 * Serializes every entry of `internalLinks` as a `(fromPageId,toPageId)` tuple
 * and writes them all as one `INSERT INTO internal_links VALUES ...;`
 * statement to `3-internal-links-inserts.sql` inside `SQL_OUTPUT_PATH`.
 * @returns {Promise<void>}
 */
const cleanInternalLinksSQLWriteToFile = async () => {
  console.log("cleanInternalLinksSQLWriteToFile - Writing to file...")
  const INSERT_INTO_START_OUTPUT = "INSERT INTO internal_links VALUES "
  const outputFilePath = path.join(
    SQL_OUTPUT_PATH,
    "3-internal-links-inserts.sql",
  )
  /**
   * @type {string[]}
   */
  const tuples = []
  for (const link of internalLinks) {
    tuples.push(`(${link.fromPageId},${link.toPageId})`)
  }
  const statement = `${INSERT_INTO_START_OUTPUT}${tuples.join(",")};`
  await fs.promises.writeFile(outputFilePath, statement, { encoding: "utf-8" })
  console.log("cleanInternalLinksSQLWriteToFile - Done.")
}
await cleanInternalLinksSQLWriteToFile()
await cleanInternalLinksSQL()

View File

@ -1,6 +1,7 @@
CREATE TABLE `pages` (
`id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
`title` VARBINARY(255) NOT NULL DEFAULT '',
-- `is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
PRIMARY KEY (`id`),
UNIQUE KEY (`title`)
@ -10,10 +11,12 @@ CREATE TABLE `pages` (
-- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config.
CREATE TABLE `internal_links` (
`id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
`from_page_id` INT(8) UNSIGNED NOT NULL,
`to_page_id` INT(8) UNSIGNED NOT NULL,
PRIMARY KEY (`from_page_id`, `to_page_id`),
FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE,
FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE
PRIMARY KEY (`id`)
-- PRIMARY KEY (`from_page_id`, `to_page_id`),
-- FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE,
-- FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;