From 8dec198afe929aeda849b64d7b1b922141421f14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20LUDWIG?= Date: Wed, 7 Aug 2024 00:21:08 +0100 Subject: [PATCH] fix: wikipedia data dump POC improvements --- .gitignore | 4 +- TODO.md | 2 +- data/README.md | 8 ++- data/database-wikipedia.js | 54 ++++++++++++------- ...bles-create.sql => 0000-tables-create.sql} | 10 ++-- data/utils.js | 12 +++++ 6 files changed, 63 insertions(+), 27 deletions(-) rename data/sql/{1-tables-create.sql => 0000-tables-create.sql} (74%) diff --git a/.gitignore b/.gitignore index 297d1d3..658405c 100644 --- a/.gitignore +++ b/.gitignore @@ -23,8 +23,8 @@ build/ bin/ cache.json data/dump -data/sql/2-pages-inserts.sql -data/sql/3-internal-links-inserts.sql +data/sql/* +!data/sql/0000-tables-create.sql # debug npm-debug.log* diff --git a/TODO.md b/TODO.md index 112fe18..af11a92 100644 --- a/TODO.md +++ b/TODO.md @@ -12,7 +12,7 @@ - [x] `page.sql` (`pages` tables) - [x] `pagelinks.sql` (`internal_links` tables) - [ ] Import SQL files => Investigate why there is an error when importing - - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` + - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.from_page_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder - [ ] Documentation how to use + Last execution date - [ ] Rewrite bash script to download and extract SQL files from Wikipedia Database Dump to Node.js for better cross-platform support and easier maintenance + automation, preferably one Node.js script to generate everything to create the database diff --git a/data/README.md b/data/README.md index 39f8ec5..48daf4f 100644 --- a/data/README.md +++ b/data/README.md @@ -2,7 +2,7 @@ ```sh ./download-wikipedia-dump.sh -node --max-old-space-size=10096 
database-wikipedia.js +node --max-old-space-size=8096 database-wikipedia.js ``` ## Utils @@ -13,6 +13,12 @@ Show the first 10 characters of sql file: `head -c 10 ./dump/page.sql` To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'` +To enter the database container: `docker exec -it wikipedia-solver-database sh` + +Then: `mariadb --password="${DATABASE_PASSWORD}" --user="${DATABASE_USER}"` + +Then run `use wikipedia;` and, for example, `SELECT * FROM pages LIMIT 10;`, or execute a SQL script with `source /docker-entrypoint-initdb.d/0002-internal-links-inserts.sql;`. + ## Remove a volume ```sh diff --git a/data/database-wikipedia.js b/data/database-wikipedia.js index e33582d..712bfd9 100644 --- a/data/database-wikipedia.js +++ b/data/database-wikipedia.js @@ -1,9 +1,14 @@ import fs from "node:fs" import path from "node:path" -import { extractRowsFromSQLValues, swapKeysAndValues } from "./utils.js" +import { + extractRowsFromSQLValues, + swapKeysAndValues, + zeroPad, +} from "./utils.js" const SQL_DUMP_PATH = path.join(process.cwd(), "dump") const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql") +const SQL_FILENAME_NUMBER_PAD = 4 /** * @typedef {Record} WikipediaPagesKeyTitle @@ -110,7 +115,10 @@ const wikipediaPagesKeyId = await cleanPagesSQL() const cleanPagesSQLWriteToFile = async () => { console.log("cleanPagesSQLWriteToFile - Writing to file...") - const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql") + const sqlOutputPath = path.join( + SQL_OUTPUT_PATH, + `${zeroPad(1, SQL_FILENAME_NUMBER_PAD)}-pages-inserts.sql`, + ) const INSERT_INTO_START_OUTPUT = "INSERT INTO pages (id, title) VALUES " const wikipediaPagesString = Object.entries(wikipediaPagesKeyId) @@ -129,12 +137,6 @@ const cleanPagesSQLWriteToFile = async () => { await cleanPagesSQLWriteToFile() -const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "3-internal-links-inserts.sql") -const INSERT_INTO_START_OUTPUT = - "INSERT INTO internal_links 
(from_page_id, to_page_id) VALUES " -const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w") -await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8") - /** * Function to clean the `pagelinks.sql` file by: * - Removing all lines that don't start with `INSERT INTO...`. @@ -143,6 +145,10 @@ await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT, "utf-8") * @returns {Promise} */ const cleanInternalLinksSQL = async () => { + let internalLinksFileCount = 2 + const INSERT_INTO_START_OUTPUT = + "INSERT INTO internal_links (from_page_id, to_page_id) VALUES " + /** * @type {WikipediaPagesKeyTitle} */ @@ -157,17 +163,27 @@ const cleanInternalLinksSQL = async () => { let current = "" let lastPercent = 0 - const BATCH_SIZE = 10_000 + const BATCH_SIZE = 4_000_000 /** * @type {string[]} */ let batch = [] - const flushBatch = async (isLast = false) => { + const flushBatch = async () => { if (batch.length > 0) { - const batchString = batch.join(",") + (isLast ? ";" : ",") - await sqlOutputFile.appendFile(batchString, "utf-8") + const batchString = batch.join(",") + const fileName = `${zeroPad(internalLinksFileCount, SQL_FILENAME_NUMBER_PAD)}-internal-links-inserts.sql` + const sqlOutputPath = path.join(SQL_OUTPUT_PATH, fileName) + await fs.promises.writeFile( + sqlOutputPath, + `${INSERT_INTO_START_OUTPUT}${batchString};`, + { + encoding: "utf-8", + }, + ) + console.log(`flushBatch - ${fileName}, batch.length: ${batch.length}`) + internalLinksFileCount += 1 batch = [] } } @@ -218,7 +234,7 @@ const cleanInternalLinksSQL = async () => { if (plFromNamespace === "0" && plTargetNamespace === "0") { const toPageId = wikipediaPagesKeyTitle[plTargetTitle] - if (toPageId != null) { + if (toPageId != null && wikipediaPagesKeyId[plFromPageId] != null) { /** * @type {WikipediaInternalLink} */ @@ -229,19 +245,21 @@ const cleanInternalLinksSQL = async () => { batch.push( `(${wikipediaInternalLink.fromPageId},${wikipediaInternalLink.toPageId})`, ) - - if (batch.length >= 
BATCH_SIZE) { - await flushBatch() - } } } } + + if (batch.length >= BATCH_SIZE) { + sqlInputFileStream.pause() + await flushBatch() + sqlInputFileStream.resume() + } }) .on("error", (error) => { return reject(error) }) .on("close", async () => { - await flushBatch(true) + await flushBatch() console.log("cleanInternalLinksSQL - Bytes read (100%).") return resolve() }) diff --git a/data/sql/1-tables-create.sql b/data/sql/0000-tables-create.sql similarity index 74% rename from data/sql/1-tables-create.sql rename to data/sql/0000-tables-create.sql index b1efa4a..6454d64 100644 --- a/data/sql/1-tables-create.sql +++ b/data/sql/0000-tables-create.sql @@ -11,12 +11,12 @@ CREATE TABLE `pages` ( -- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config. CREATE TABLE `internal_links` ( - `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT, + -- `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT, `from_page_id` INT(8) UNSIGNED NOT NULL, `to_page_id` INT(8) UNSIGNED NOT NULL, - PRIMARY KEY (`id`) - -- PRIMARY KEY (`from_page_id`, `to_page_id`), - -- FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE, - -- FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE + -- PRIMARY KEY (`id`) + PRIMARY KEY (`from_page_id`, `to_page_id`), + FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE, + FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE ) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; diff --git a/data/utils.js b/data/utils.js index 1da0ac7..cb5622d 100644 --- a/data/utils.js +++ b/data/utils.js @@ -74,3 +74,15 @@ export const swapKeysAndValues = (object) => { }), ) } + +/** + * + * @param {number} number + * @param {number} places + * @returns {string} + * @example zeroPad(1, 2) // '01' + * @example zeroPad(10, 2) // '10' + 
*/ +export const zeroPad = (number, places = 2) => { + return number.toString().padStart(places, "0") +}