From 3bed3e0578effc7f2a888c06399dc79124320d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20LUDWIG?= Date: Mon, 5 Aug 2024 17:36:19 +0200 Subject: [PATCH] perf: improve memory usage for POC to get wikipedia dump --- TODO.md | 2 +- data/README.md | 2 + data/database-wikipedia.js | 81 +++++++++++++++++------------------- data/sql/1-tables-create.sql | 9 ++-- 4 files changed, 48 insertions(+), 46 deletions(-) diff --git a/TODO.md b/TODO.md index fd3bd85..112fe18 100644 --- a/TODO.md +++ b/TODO.md @@ -11,7 +11,7 @@ - [x] Adapt downloaded SQL files - [x] `page.sql` (`pages` tables) - [x] `pagelinks.sql` (`internal_links` tables) - - [x] Import SQL files + - [ ] Import SQL files => Investigate why there is an error when importing - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder - [ ] Documentation how to use + Last execution date diff --git a/data/README.md b/data/README.md index bb7cef7..39f8ec5 100644 --- a/data/README.md +++ b/data/README.md @@ -9,6 +9,8 @@ node --max-old-space-size=10096 database-wikipedia.js Show the first 10 line of sql file: `head -n 10 ./dump/page.sql` +Show the first 10 characters of sql file: `head -c 10 ./dump/page.sql` + To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'` ## Remove a volume diff --git a/data/database-wikipedia.js b/data/database-wikipedia.js index 3d542e8..e33582d 100644 --- a/data/database-wikipedia.js +++ b/data/database-wikipedia.js @@ -111,7 +111,7 @@ const wikipediaPagesKeyId = await cleanPagesSQL() const cleanPagesSQLWriteToFile = async () => { console.log("cleanPagesSQLWriteToFile - Writing to file...") const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql") - const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES " + const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES " const wikipediaPagesString = Object.entries(wikipediaPagesKeyId) .map(([id, title]) => { @@ -129,19 +129,20 @@ const cleanPagesSQLWriteToFile = async () => { await cleanPagesSQLWriteToFile() +const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "3-internal-links-inserts.sql") +const INSERT_INTO_START_OUTPUT = + "INSERT INTO internal_links (from_page_id, to_page_id) VALUES " +const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w") +await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8") + /** * Function to clean the `pagelinks.sql` file by: * - Removing all lines that don't start with `INSERT INTO...`. * - Filter by keeping rows where `pl_from_namespace` (2nd column) is equal to 0. * - Transform the rows to internal links with fromPageId and toPageId. - * @returns {Promise} + * @returns {Promise} */ const cleanInternalLinksSQL = async () => { - /** - * @type {WikipediaInternalLink[]} - */ - const internalLinks = [] - /** * @type {WikipediaPagesKeyTitle} */ @@ -156,13 +157,28 @@ const cleanInternalLinksSQL = async () => { let current = "" let lastPercent = 0 + const BATCH_SIZE = 10_000 + + /** + * @type {string[]} + */ + let batch = [] + + const flushBatch = async (isLast = false) => { + if (batch.length > 0) { + const batchString = batch.join(",") + (isLast ? ";" : ",") + await sqlOutputFile.appendFile(batchString, "utf-8") + batch = [] + } + } + return await new Promise((resolve, reject) => { sqlInputFileStream - .on("data", (dataInput) => { + .on("data", async (dataInput) => { const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size const bytesReadPercent = bytesReadRatio * 100 - if (bytesReadPercent - lastPercent >= 1) { + if (bytesReadPercent - lastPercent >= 0.5) { console.log( `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, ) @@ -201,18 +217,22 @@ const cleanInternalLinksSQL = async () => { const plFromNamespace = row[3] ?? "" if (plFromNamespace === "0" && plTargetNamespace === "0") { - if ( - wikipediaPagesKeyTitle[plTargetTitle] != null && - wikipediaPagesKeyId[plFromPageId] != null - ) { + const toPageId = wikipediaPagesKeyTitle[plTargetTitle] + if (toPageId != null) { /** * @type {WikipediaInternalLink} */ const wikipediaInternalLink = { fromPageId: plFromPageId, - toPageId: wikipediaPagesKeyTitle[plTargetTitle], + toPageId, + } + batch.push( + `(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`, + ) + + if (batch.length >= BATCH_SIZE) { + await flushBatch() } - internalLinks.push(wikipediaInternalLink) } } } @@ -220,35 +240,12 @@ const cleanInternalLinksSQL = async () => { .on("error", (error) => { return reject(error) }) - .on("close", () => { + .on("close", async () => { + await flushBatch(true) console.log("cleanInternalLinksSQL - Bytes read (100%).") - return resolve(internalLinks) + return resolve() }) }) } -const internalLinks = await cleanInternalLinksSQL() - -const cleanInternalLinksSQLWriteToFile = async () => { - console.log("cleanInternalLinksSQLWriteToFile - Writing to file...") - const sqlOutputPath = path.join( - SQL_OUTPUT_PATH, - "3-internal-links-inserts.sql", - ) - const INSERT_INTO_START_OUTPUT = "INSERT INTO internal_links VALUES " - - const wikipediaPagesString = internalLinks - .map(({ fromPageId, toPageId }) => { - return `(${fromPageId},${toPageId})` - }) - .join(",") - - await fs.promises.writeFile( - sqlOutputPath, - `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`, - { encoding: "utf-8" }, - ) - console.log("cleanInternalLinksSQLWriteToFile - Done.") -} - -await cleanInternalLinksSQLWriteToFile() +await cleanInternalLinksSQL() diff --git a/data/sql/1-tables-create.sql b/data/sql/1-tables-create.sql index faaddbc..b1efa4a 100644 --- a/data/sql/1-tables-create.sql +++ b/data/sql/1-tables-create.sql @@ -1,6 +1,7 @@ CREATE TABLE `pages` ( `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT, `title` VARBINARY(255) NOT NULL DEFAULT '', + -- `is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0, PRIMARY KEY (`id`), UNIQUE KEY (`title`) @@ -10,10 +11,12 @@ CREATE TABLE `pages` ( -- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config. CREATE TABLE `internal_links` ( + `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT, `from_page_id` INT(8) UNSIGNED NOT NULL, `to_page_id` INT(8) UNSIGNED NOT NULL, - PRIMARY KEY (`from_page_id`, `to_page_id`), - FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE, - FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE + PRIMARY KEY (`id`) + -- PRIMARY KEY (`from_page_id`, `to_page_id`), + -- FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE, + -- FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE ) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;