From fee0b4e6810903e0c25713d38731d30ad2bac723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20LUDWIG?= Date: Mon, 5 Aug 2024 14:04:28 +0200 Subject: [PATCH] feat: adapt internal_links SQL file POC --- .gitignore | 1 + TODO.md | 13 +- data/README.md | 73 +++++++- data/database-wikipedia.js | 163 ++++++++++++++++-- data/download-wikipedia-dump.sh | 11 +- ...1-pages-create.sql => 1-tables-create.sql} | 12 +- data/utils.js | 13 ++ 7 files changed, 257 insertions(+), 29 deletions(-) rename data/sql/{1-pages-create.sql => 1-tables-create.sql} (53%) diff --git a/.gitignore b/.gitignore index 62337a4..297d1d3 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ bin/ cache.json data/dump data/sql/2-pages-inserts.sql +data/sql/3-internal-links-inserts.sql # debug npm-debug.log* diff --git a/TODO.md b/TODO.md index 0dfcacc..fd3bd85 100644 --- a/TODO.md +++ b/TODO.md @@ -1,28 +1,31 @@ # TODO -- [x] chore: initial commit (+ mirror on GitHub) +- [x] chore: initial commit - [x] Deploy first staging version (v1.0.0-staging.1) - [x] Wikipedia Database Dump - [x] Download SQL files - [x] Extract SQL files - [x] Tables structure `CREATE TABLE` - [x] `page.sql` (`pages` tables) - - [ ] `pagelinks.sql` (`internal_links` tables) + - [x] `pagelinks.sql` (`internal_links` tables) - [x] Adapt downloaded SQL files - [x] `page.sql` (`pages` tables) - - [ ] `pagelinks.sql` (`internal_links` tables) + - [x] `pagelinks.sql` (`internal_links` tables) - [x] Import SQL files - - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` + - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page` - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder - [ ] Documentation how to use + Last execution date - [ ] Rewrite bash script to download and extract SQL 
files from Wikipedia Database Dump to Node.js for better cross-platform support and easier maintenance + automation, preferably one Node.js script to generate everything to create the database - [ ] Verify file content up to before inserts, to check if it matches last version, and diff with last version + - [ ] Update logic to create custom `internal_links` table to make it work with latest wikipedia dumps (notably concerning the change in `pagelinks.sql` where the title is not included anymore, but instead it uses `pl_target_id`, foreign key to `linktarget`), last tested dump working `20240420` + - [ ] Handle redirects - [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) to get shortest paths between 2 pages - [ ] Implement Wikipedia Game Solver (`website`) with inputs, button to submit, and list all pages to go from one to another, or none if it is not possible -- [ ] Check how to deal with redirects (+ Wikipedia Database Dump related) - [ ] Implement toast notifications for errors, warnings, and success messages - [ ] Implement CLI (`cli`) - [ ] Add docs to add locale/edit translations, create component, install a dependency in a package, create a new package, technology used, architecture, links where it's deployed, how to use/install for end users, how to update dependencies with `npx taze -l` etc. 
+- [ ] GitHub Mirror +- [ ] Delete `TODO.md` file and instead use issue for the remaining tasks ## Links diff --git a/data/README.md b/data/README.md index 0f114e1..bb7cef7 100644 --- a/data/README.md +++ b/data/README.md @@ -1,5 +1,10 @@ # Wikipedia data +```sh +./download-wikipedia-dump.sh +node --max-old-space-size=10096 database-wikipedia.js +``` + ## Utils Show the first 10 line of sql file: `head -n 10 ./dump/page.sql` @@ -34,17 +39,25 @@ Import data.sql MySQL Docker Container: - - - -## `page.sql.gz` - -### MySQL full version +- Run SQL queries against Wikipedia: ```sql --- MariaDB dump 10.19 Distrib 10.5.23-MariaDB, for debian-linux-gnu (x86_64) +-- Get the sanitized title of a page linked in the page with title 'Node.js' +SELECT lt.lt_title FROM linktarget lt WHERE lt.lt_id = ( + SELECT pl.pl_target_id FROM pagelinks pl WHERE pl.pl_from = ( + SELECT p.page_id FROM page p WHERE p.page_title = 'Node.js' AND p.page_namespace = 0 + ) LIMIT 1 +); +``` + +## `page.sql.gz` - MySQL full version up until inserts + +```sql +-- MySQL dump 10.19 Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64) -- -- Host: db1206 Database: enwiki -- ------------------------------------------------------ --- Server version 10.6.17-MariaDB-log +-- Server version 10.4.26-MariaDB-log /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; @@ -82,10 +95,56 @@ CREATE TABLE `page` ( KEY `page_random` (`page_random`), KEY `page_len` (`page_len`), KEY `page_redirect_namespace_len` (`page_is_redirect`,`page_namespace`,`page_len`) -) ENGINE=InnoDB AUTO_INCREMENT=77490241 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; +) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; /*!40101 SET character_set_client = @saved_cs_client */; -- -- Dumping data for table `page` -- ``` + +## `pagelinks.sql.gz` - MySQL full version up until inserts + +```sql +-- MySQL dump 10.19 Distrib 
10.3.38-MariaDB, for debian-linux-gnu (x86_64) +-- +-- Host: db1206 Database: enwiki +-- ------------------------------------------------------ +-- Server version 10.4.26-MariaDB-log + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!40101 SET NAMES utf8mb4 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `pagelinks` +-- + +DROP TABLE IF EXISTS `pagelinks`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `pagelinks` ( + `pl_from` int(8) unsigned NOT NULL DEFAULT 0, + `pl_namespace` int(11) NOT NULL DEFAULT 0, + `pl_title` varbinary(255) NOT NULL DEFAULT '', + `pl_from_namespace` int(11) NOT NULL DEFAULT 0, + `pl_target_id` bigint(20) unsigned DEFAULT NULL, + PRIMARY KEY (`pl_from`,`pl_namespace`,`pl_title`), + KEY `pl_namespace` (`pl_namespace`,`pl_title`,`pl_from`), + KEY `pl_backlinks_namespace` (`pl_from_namespace`,`pl_namespace`,`pl_title`,`pl_from`), + KEY `pl_target_id` (`pl_target_id`,`pl_from`), + KEY `pl_backlinks_namespace_target_id` (`pl_from_namespace`,`pl_target_id`,`pl_from`) +) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `pagelinks` +-- +``` diff --git a/data/database-wikipedia.js b/data/database-wikipedia.js index c4fddb1..3d542e8 100644 --- a/data/database-wikipedia.js +++ b/data/database-wikipedia.js @@ -1,28 +1,42 @@ import fs from "node:fs" import path from 
"node:path" -import { extractRowsFromSQLValues } from "./utils.js" +import { extractRowsFromSQLValues, swapKeysAndValues } from "./utils.js" const SQL_DUMP_PATH = path.join(process.cwd(), "dump") const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql") /** - * @typedef {Record} WikipediaPages + * @typedef {Record} WikipediaPagesKeyTitle * * Object to store pages from Wikipedia: * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ). * - Value: page id. */ +/** + * @typedef {Record} WikipediaPagesKeyId + * + * Object to store pages from Wikipedia: + * - Key: page id. + * - Value: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ). + */ + +/** + * @typedef WikipediaInternalLink + * @property {number} fromPageId + * @property {number} toPageId + */ + /** * Function to clean the `page.sql` file by: * - Removing all lines that don't start with `INSERT INTO...`. * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0). * - Only keep columns `page_id` (1st column) and `page_title` (3rd column). 
- * @returns {Promise} + * @returns {Promise} */ const cleanPagesSQL = async () => { - /** @type {WikipediaPages} */ - const wikipediaPages = {} + /** @type {WikipediaPagesKeyId} */ + const wikipediaPagesKeyId = {} const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES " const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql") @@ -41,7 +55,7 @@ const cleanPagesSQL = async () => { if (bytesReadPercent - lastPercent >= 1) { console.log( - `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, + `cleanPagesSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, ) lastPercent = bytesReadPercent } @@ -78,7 +92,7 @@ const cleanPagesSQL = async () => { const isRedirect = row[3] === "1" if (namespace === "0" && !isRedirect) { - wikipediaPages[title] = id + wikipediaPagesKeyId[id] = title } } }) @@ -86,19 +100,21 @@ const cleanPagesSQL = async () => { return reject(error) }) .on("close", () => { - return resolve(wikipediaPages) + console.log("cleanPagesSQL - Bytes read (100%).") + return resolve(wikipediaPagesKeyId) }) }) } -const wikipediaPages = await cleanPagesSQL() +const wikipediaPagesKeyId = await cleanPagesSQL() const cleanPagesSQLWriteToFile = async () => { + console.log("cleanPagesSQLWriteToFile - Writing to file...") const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql") const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES " - const wikipediaPagesString = Object.entries(wikipediaPages) - .map(([title, id]) => { + const wikipediaPagesString = Object.entries(wikipediaPagesKeyId) + .map(([id, title]) => { return `(${id},${title})` }) .join(",") @@ -108,6 +124,131 @@ const cleanPagesSQLWriteToFile = async () => { `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`, { encoding: "utf-8" }, ) + console.log("cleanPagesSQLWriteToFile - Done.") } await cleanPagesSQLWriteToFile() + +/** + * Function to clean the `pagelinks.sql` file by: + * - 
Removing all lines that don't start with `INSERT INTO...`. + * - Filter by keeping rows where `pl_from_namespace` (2nd column) is equal to 0. + * - Transform the rows to internal links with fromPageId and toPageId. + * @returns {Promise} + */ +const cleanInternalLinksSQL = async () => { + /** + * @type {WikipediaInternalLink[]} + */ + const internalLinks = [] + + /** + * @type {WikipediaPagesKeyTitle} + */ + const wikipediaPagesKeyTitle = swapKeysAndValues(wikipediaPagesKeyId) + + const INSERT_INTO_START_INPUT = "INSERT INTO `pagelinks` VALUES " + const sqlInputPath = path.join(SQL_DUMP_PATH, "pagelinks.sql") + const sqlInputStat = await fs.promises.stat(sqlInputPath) + const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8") + + let isInsideInsert = false + let current = "" + let lastPercent = 0 + + return await new Promise((resolve, reject) => { + sqlInputFileStream + .on("data", (dataInput) => { + const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size + const bytesReadPercent = bytesReadRatio * 100 + + if (bytesReadPercent - lastPercent >= 1) { + console.log( + `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`, + ) + lastPercent = bytesReadPercent + } + + let data = current + dataInput + + if (!isInsideInsert) { + const lines = data.split("\n").filter((line) => { + return line.startsWith(INSERT_INTO_START_INPUT) + }) + const [line] = lines + if (line == null) { + sqlInputFileStream.close() + return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`)) + } + isInsideInsert = true + const lineStripped = line.slice(INSERT_INTO_START_INPUT.length) + data = lineStripped + } + + const { rows, unCompleted } = extractRowsFromSQLValues(data) + current = unCompleted + + for (const row of rows) { + if (row.length !== 5) { + sqlInputFileStream.close() + console.error([row]) + return reject(new Error(`Invalid Row values.`)) + } + + const plFromPageId = 
Number.parseInt(row[0] ?? "0", 10) + const plTargetNamespace = row[1] ?? "" + const plTargetTitle = row[2] ?? "" + const plFromNamespace = row[3] ?? "" + + if (plFromNamespace === "0" && plTargetNamespace === "0") { + if ( + wikipediaPagesKeyTitle[plTargetTitle] != null && + wikipediaPagesKeyId[plFromPageId] != null + ) { + /** + * @type {WikipediaInternalLink} + */ + const wikipediaInternalLink = { + fromPageId: plFromPageId, + toPageId: wikipediaPagesKeyTitle[plTargetTitle], + } + internalLinks.push(wikipediaInternalLink) + } + } + } + }) + .on("error", (error) => { + return reject(error) + }) + .on("close", () => { + console.log("cleanInternalLinksSQL - Bytes read (100%).") + return resolve(internalLinks) + }) + }) +} + +const internalLinks = await cleanInternalLinksSQL() + +const cleanInternalLinksSQLWriteToFile = async () => { + console.log("cleanInternalLinksSQLWriteToFile - Writing to file...") + const sqlOutputPath = path.join( + SQL_OUTPUT_PATH, + "3-internal-links-inserts.sql", + ) + const INSERT_INTO_START_OUTPUT = "INSERT INTO internal_links VALUES " + + const wikipediaPagesString = internalLinks + .map(({ fromPageId, toPageId }) => { + return `(${fromPageId},${toPageId})` + }) + .join(",") + + await fs.promises.writeFile( + sqlOutputPath, + `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`, + { encoding: "utf-8" }, + ) + console.log("cleanInternalLinksSQLWriteToFile - Done.") +} + +await cleanInternalLinksSQLWriteToFile() diff --git a/data/download-wikipedia-dump.sh b/data/download-wikipedia-dump.sh index 1fa1887..43aa861 100755 --- a/data/download-wikipedia-dump.sh +++ b/data/download-wikipedia-dump.sh @@ -8,7 +8,8 @@ set -o nounset set -o pipefail DUMP_DIRECTORY="dump" -DOWNLOAD_DATE="latest" +DOWNLOAD_DATE="20240420" +# DOWNLOAD_DATE="latest" WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-" mkdir --parents "${DUMP_DIRECTORY}" @@ -18,16 +19,16 @@ download_file() { local 
file_path_output="${DUMP_DIRECTORY}/${filename}" local file_url="${WIKIPEDIA_DUMP_URL}${filename}" - if [[ ! -f "${file_path_output}" ]]; then + if [[ ! -f "${file_path_output%.gz}" ]]; then echo "Downloading \"${filename}\" from \"${file_url}\"..." wget --output-document="${file_path_output}" "${file_url}" else - echo "File \"${filename}\" from \"${file_url}\" already exists." + echo "File \"${filename%.gz}\" from \"${file_url}\" already exists." fi } -download_file "page.sql.gz" -download_file "pagelinks.sql.gz" +# download_file "page.sql.gz" +# download_file "pagelinks.sql.gz" extract_file() { local filename="${1}" diff --git a/data/sql/1-pages-create.sql b/data/sql/1-tables-create.sql similarity index 53% rename from data/sql/1-pages-create.sql rename to data/sql/1-tables-create.sql index 0bedf96..faaddbc 100644 --- a/data/sql/1-pages-create.sql +++ b/data/sql/1-tables-create.sql @@ -1,9 +1,19 @@ CREATE TABLE `pages` ( `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT, `title` VARBINARY(255) NOT NULL DEFAULT '', + PRIMARY KEY (`id`), UNIQUE KEY (`title`) -) ENGINE=InnoDB AUTO_INCREMENT=77490241 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; +) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; -- VARBINARY usage instead of VARCHAR explanation: -- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config. 
+ + CREATE TABLE `internal_links` ( + `from_page_id` INT(8) UNSIGNED NOT NULL, + `to_page_id` INT(8) UNSIGNED NOT NULL, + + PRIMARY KEY (`from_page_id`, `to_page_id`), + FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE, + FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED; diff --git a/data/utils.js b/data/utils.js index 48f77bf..1da0ac7 100644 --- a/data/utils.js +++ b/data/utils.js @@ -61,3 +61,16 @@ export const extractRowsFromSQLValues = (input) => { return { rows, unCompleted } } + +/** + * Swaps the keys and values of an object. + * @param {Record<string | number, string | number>} object - The object whose keys and values should be swapped. + * @returns {Record<string, string>} A new object with the original values as keys and the original keys as values (both coerced to strings). + */ +export const swapKeysAndValues = (object) => { + return Object.fromEntries( + Object.entries(object).map(([key, value]) => { + return [value, key] + }), + ) +}