perf: improve memory usage for POC to get wikipedia dump

This commit is contained in:
Théo LUDWIG 2024-08-05 17:36:19 +02:00
parent fee0b4e681
commit 3bed3e0578
Signed by: theoludwig
GPG Key ID: ADFE5A563D718F3B
4 changed files with 48 additions and 46 deletions

View File

@ -11,7 +11,7 @@
- [x] Adapt downloaded SQL files - [x] Adapt downloaded SQL files
- [x] `page.sql` (`pages` tables) - [x] `page.sql` (`pages` tables)
- [x] `pagelinks.sql` (`internal_links` tables) - [x] `pagelinks.sql` (`internal_links` tables)
- [x] Import SQL files - [ ] Import SQL files => Investigate why there is an error when importing
- [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
- [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder
- [ ] Documentation how to use + Last execution date - [ ] Documentation how to use + Last execution date

View File

@ -9,6 +9,8 @@ node --max-old-space-size=10096 database-wikipedia.js
Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql` Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
Show the first 10 characters of the SQL file: `head -c 10 ./dump/page.sql`
To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'` To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
## Remove a volume ## Remove a volume

View File

@ -111,7 +111,7 @@ const wikipediaPagesKeyId = await cleanPagesSQL()
const cleanPagesSQLWriteToFile = async () => { const cleanPagesSQLWriteToFile = async () => {
console.log("cleanPagesSQLWriteToFile - Writing to file...") console.log("cleanPagesSQLWriteToFile - Writing to file...")
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql") const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES " const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES "
const wikipediaPagesString = Object.entries(wikipediaPagesKeyId) const wikipediaPagesString = Object.entries(wikipediaPagesKeyId)
.map(([id, title]) => { .map(([id, title]) => {
@ -129,19 +129,20 @@ const cleanPagesSQLWriteToFile = async () => {
await cleanPagesSQLWriteToFile() await cleanPagesSQLWriteToFile()
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "3-internal-links-inserts.sql")
const INSERT_INTO_START_OUTPUT =
"INSERT INTO internal_links (from_page_id, to_page_id) VALUES "
const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8")
/** /**
* Function to clean the `pagelinks.sql` file by: * Function to clean the `pagelinks.sql` file by:
* - Removing all lines that don't start with `INSERT INTO...`. * - Removing all lines that don't start with `INSERT INTO...`.
* - Filter by keeping rows where `pl_from_namespace` (2nd column) is equal to 0. * - Filter by keeping rows where `pl_from_namespace` (2nd column) is equal to 0.
* - Transform the rows to internal links with fromPageId and toPageId. * - Transform the rows to internal links with fromPageId and toPageId.
* @returns {Promise<WikipediaInternalLink[]>} * @returns {Promise<void>}
*/ */
const cleanInternalLinksSQL = async () => { const cleanInternalLinksSQL = async () => {
/**
* @type {WikipediaInternalLink[]}
*/
const internalLinks = []
/** /**
* @type {WikipediaPagesKeyTitle} * @type {WikipediaPagesKeyTitle}
*/ */
@ -156,13 +157,28 @@ const cleanInternalLinksSQL = async () => {
let current = "" let current = ""
let lastPercent = 0 let lastPercent = 0
const BATCH_SIZE = 10_000
/**
* @type {string[]}
*/
let batch = []
const flushBatch = async (isLast = false) => {
if (batch.length > 0) {
const batchString = batch.join(",") + (isLast ? ";" : ",")
await sqlOutputFile.appendFile(batchString, "utf-8")
batch = []
}
}
return await new Promise((resolve, reject) => { return await new Promise((resolve, reject) => {
sqlInputFileStream sqlInputFileStream
.on("data", (dataInput) => { .on("data", async (dataInput) => {
const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
const bytesReadPercent = bytesReadRatio * 100 const bytesReadPercent = bytesReadRatio * 100
if (bytesReadPercent - lastPercent >= 1) { if (bytesReadPercent - lastPercent >= 0.5) {
console.log( console.log(
`cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
) )
@ -201,18 +217,22 @@ const cleanInternalLinksSQL = async () => {
const plFromNamespace = row[3] ?? "" const plFromNamespace = row[3] ?? ""
if (plFromNamespace === "0" && plTargetNamespace === "0") { if (plFromNamespace === "0" && plTargetNamespace === "0") {
if ( const toPageId = wikipediaPagesKeyTitle[plTargetTitle]
wikipediaPagesKeyTitle[plTargetTitle] != null && if (toPageId != null) {
wikipediaPagesKeyId[plFromPageId] != null
) {
/** /**
* @type {WikipediaInternalLink} * @type {WikipediaInternalLink}
*/ */
const wikipediaInternalLink = { const wikipediaInternalLink = {
fromPageId: plFromPageId, fromPageId: plFromPageId,
toPageId: wikipediaPagesKeyTitle[plTargetTitle], toPageId,
}
batch.push(
`(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`,
)
if (batch.length >= BATCH_SIZE) {
await flushBatch()
} }
internalLinks.push(wikipediaInternalLink)
} }
} }
} }
@ -220,35 +240,12 @@ const cleanInternalLinksSQL = async () => {
.on("error", (error) => { .on("error", (error) => {
return reject(error) return reject(error)
}) })
.on("close", () => { .on("close", async () => {
await flushBatch(true)
console.log("cleanInternalLinksSQL - Bytes read (100%).") console.log("cleanInternalLinksSQL - Bytes read (100%).")
return resolve(internalLinks) return resolve()
}) })
}) })
} }
const internalLinks = await cleanInternalLinksSQL() await cleanInternalLinksSQL()
const cleanInternalLinksSQLWriteToFile = async () => {
console.log("cleanInternalLinksSQLWriteToFile - Writing to file...")
const sqlOutputPath = path.join(
SQL_OUTPUT_PATH,
"3-internal-links-inserts.sql",
)
const INSERT_INTO_START_OUTPUT = "INSERT INTO internal_links VALUES "
const wikipediaPagesString = internalLinks
.map(({ fromPageId, toPageId }) => {
return `(${fromPageId},${toPageId})`
})
.join(",")
await fs.promises.writeFile(
sqlOutputPath,
`${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
{ encoding: "utf-8" },
)
console.log("cleanInternalLinksSQLWriteToFile - Done.")
}
await cleanInternalLinksSQLWriteToFile()

View File

@ -1,6 +1,7 @@
CREATE TABLE `pages` ( CREATE TABLE `pages` (
`id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT, `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
`title` VARBINARY(255) NOT NULL DEFAULT '', `title` VARBINARY(255) NOT NULL DEFAULT '',
-- `is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
PRIMARY KEY (`id`), PRIMARY KEY (`id`),
UNIQUE KEY (`title`) UNIQUE KEY (`title`)
@ -10,10 +11,12 @@ CREATE TABLE `pages` (
-- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config. -- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config.
CREATE TABLE `internal_links` ( CREATE TABLE `internal_links` (
`id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
`from_page_id` INT(8) UNSIGNED NOT NULL, `from_page_id` INT(8) UNSIGNED NOT NULL,
`to_page_id` INT(8) UNSIGNED NOT NULL, `to_page_id` INT(8) UNSIGNED NOT NULL,
PRIMARY KEY (`from_page_id`, `to_page_id`), PRIMARY KEY (`id`)
FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE, -- PRIMARY KEY (`from_page_id`, `to_page_id`),
FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE -- FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE,
-- FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; ) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;