feat: adapt internal_links SQL file POC

2024-08-05 14:04:28 +02:00 · 2024-08-05 14:04:28 +02:00 · fee0b4e681
commit fee0b4e681
parent 61914d2392
7 changed files with 257 additions and 29 deletions
--- a/.gitignore
+++ b/.gitignore
@ -24,6 +24,7 @@ bin/
 cache.json
 data/dump
 data/sql/2-pages-inserts.sql
+data/sql/3-internal-links-inserts.sql

 # debug
 npm-debug.log*
--- a/TODO.md
+++ b/TODO.md
@ -1,28 +1,31 @@
 # TODO

- [x] chore: initial commit (+ mirror on GitHub)
+- [x] chore: initial commit
 - [x] Deploy first staging version (v1.0.0-staging.1)
 - [x] Wikipedia Database Dump
  - [x] Download SQL files
  - [x] Extract SQL files
  - [x] Tables structure `CREATE TABLE`
    - [x] `page.sql` (`pages` tables)
-    - [ ] `pagelinks.sql` (`internal_links` tables)
+    - [x] `pagelinks.sql` (`internal_links` tables)
  - [x] Adapt downloaded SQL files
    - [x] `page.sql` (`pages` tables)
-    - [ ] `pagelinks.sql` (`internal_links` tables)
+    - [x] `pagelinks.sql` (`internal_links` tables)
  - [x] Import SQL files
-  - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
+  - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
  - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder
  - [ ] Documentation how to use + Last execution date
  - [ ] Rewrite bash script to download and extract SQL files from Wikipedia Database Dump to Node.js for better cross-platform support and easier maintenance + automation, preferably one Node.js script to generate everything to create the database
  - [ ] Verify file content up to before inserts, to check if it matches last version, and diff with last version
+  - [ ] Update logic to create custom `internal_links` table to make it work with latest Wikipedia dumps (notably concerning the change in `pagelinks.sql` where the title is not included anymore, but instead it uses `pl_target_id`, foreign key to `linktarget`); last dump tested and working: `20240420`
+  - [ ] Handle redirects
 - [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) to get shortest paths between 2 pages
 - [ ] Implement Wikipedia Game Solver (`website`) with inputs, button to submit, and list all pages to go from one to another, or none if it is not possible
- [ ] Check how to deal with redirects (+ Wikipedia Database Dump related)
 - [ ] Implement toast notifications for errors, warnings, and success messages
 - [ ] Implement CLI (`cli`)
 - [ ] Add docs to add locale/edit translations, create component, install a dependency in a package, create a new package, technology used, architecture, links where it's deployed, how to use/install for end users, how to update dependencies with `npx taze -l` etc.
+- [ ] GitHub Mirror
+- [ ] Delete `TODO.md` file and instead use issues for the remaining tasks

 ## Links

--- a/data/README.md
+++ b/data/README.md
@ -1,5 +1,10 @@
 # Wikipedia data

+```sh
+./download-wikipedia-dump.sh
+node --max-old-space-size=10096 database-wikipedia.js
+```
+
 ## Utils

 Show the first 10 lines of an SQL file: `head -n 10 ./dump/page.sql`
@ -34,17 +39,25 @@ Import data.sql MySQL Docker Container: <https://stackoverflow.com/questions/438
 - Database layout: <https://www.mediawiki.org/wiki/Manual:Database_layout>
 - <https://en.wikipedia.org/wiki/Wikipedia:Database_download>
 - <https://dumps.wikimedia.org/enwiki/>
-
-## `page.sql.gz`
-
-### MySQL full version
+- Run SQL queries against Wikipedia: <https://quarry.wmcloud.org/>

 ```sql
-- MariaDB dump 10.19  Distrib 10.5.23-MariaDB, for debian-linux-gnu (x86_64)
+-- Get the sanitized title of a page linked in the page with title 'Node.js'
+SELECT lt.lt_title FROM linktarget lt WHERE lt.lt_id = (
+  SELECT pl.pl_target_id FROM pagelinks pl WHERE pl.pl_from = (
+    SELECT p.page_id FROM page p WHERE p.page_title = 'Node.js' AND p.page_namespace = 0
+  ) LIMIT 1
+);
+```
+
+## `page.sql.gz` - MySQL full version up until inserts
+
+```sql
+-- MySQL dump 10.19  Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64)
 --
 -- Host: db1206    Database: enwiki
 -- ------------------------------------------------------
-- Server version	10.6.17-MariaDB-log
+-- Server version	10.4.26-MariaDB-log

 /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
 /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
@ -82,10 +95,56 @@ CREATE TABLE `page` (
  KEY `page_random` (`page_random`),
  KEY `page_len` (`page_len`),
  KEY `page_redirect_namespace_len` (`page_is_redirect`,`page_namespace`,`page_len`)
-) ENGINE=InnoDB AUTO_INCREMENT=77490241 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
+) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
 /*!40101 SET character_set_client = @saved_cs_client */;

 --
 -- Dumping data for table `page`
 --
 ```
+
+## `pagelinks.sql.gz` - MySQL full version up until inserts
+
+```sql
+-- MySQL dump 10.19  Distrib 10.3.38-MariaDB, for debian-linux-gnu (x86_64)
+--
+-- Host: db1206    Database: enwiki
+-- ------------------------------------------------------
+-- Server version	10.4.26-MariaDB-log
+
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!40101 SET NAMES utf8mb4 */;
+/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
+/*!40103 SET TIME_ZONE='+00:00' */;
+/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
+/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
+/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
+/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
+
+--
+-- Table structure for table `pagelinks`
+--
+
+DROP TABLE IF EXISTS `pagelinks`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!40101 SET character_set_client = utf8 */;
+CREATE TABLE `pagelinks` (
+  `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
+  `pl_namespace` int(11) NOT NULL DEFAULT 0,
+  `pl_title` varbinary(255) NOT NULL DEFAULT '',
+  `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
+  `pl_target_id` bigint(20) unsigned DEFAULT NULL,
+  PRIMARY KEY (`pl_from`,`pl_namespace`,`pl_title`),
+  KEY `pl_namespace` (`pl_namespace`,`pl_title`,`pl_from`),
+  KEY `pl_backlinks_namespace` (`pl_from_namespace`,`pl_namespace`,`pl_title`,`pl_from`),
+  KEY `pl_target_id` (`pl_target_id`,`pl_from`),
+  KEY `pl_backlinks_namespace_target_id` (`pl_from_namespace`,`pl_target_id`,`pl_from`)
+) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `pagelinks`
+--
+```
--- a/data/database-wikipedia.js
+++ b/data/database-wikipedia.js
@ -1,28 +1,42 @@
 import fs from "node:fs"
 import path from "node:path"
-import { extractRowsFromSQLValues } from "./utils.js"
+import { extractRowsFromSQLValues, swapKeysAndValues } from "./utils.js"

 const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
 const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")

 /**
- * @typedef {Record<string, number>} WikipediaPages
+ * @typedef {Record<string, number>} WikipediaPagesKeyTitle
 *
 * Object to store pages from Wikipedia:
 * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
 * - Value: page id.
 */

+/**
+ * @typedef {Record<string, number>} WikipediaPagesKeyId
+ *
+ * Object to store pages from Wikipedia:
+ * - Key: page id.
+ * - Value: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
+ */
+
+/**
+ * @typedef WikipediaInternalLink
+ * @property {number} fromPageId
+ * @property {number} toPageId
+ */
+
 /**
 * Function to clean the `page.sql` file by:
 * - Removing all lines that don't start with `INSERT INTO...`.
 * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
 * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
- * @returns {Promise<WikipediaPages>}
+ * @returns {Promise<WikipediaPagesKeyId>}
 */
 const cleanPagesSQL = async () => {
-  /** @type {WikipediaPages} */
-  const wikipediaPages = {}
+  /** @type {WikipediaPagesKeyId} */
+  const wikipediaPagesKeyId = {}

  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
@ -41,7 +55,7 @@ const cleanPagesSQL = async () => {

        if (bytesReadPercent - lastPercent >= 1) {
          console.log(
-            `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
+            `cleanPagesSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
          )
          lastPercent = bytesReadPercent
        }
@ -78,7 +92,7 @@ const cleanPagesSQL = async () => {
          const isRedirect = row[3] === "1"

          if (namespace === "0" && !isRedirect) {
-            wikipediaPages[title] = id
+            wikipediaPagesKeyId[id] = title
          }
        }
      })
@ -86,19 +100,21 @@ const cleanPagesSQL = async () => {
        return reject(error)
      })
      .on("close", () => {
-        return resolve(wikipediaPages)
+        console.log("cleanPagesSQL - Bytes read (100%).")
+        return resolve(wikipediaPagesKeyId)
      })
  })
 }

-const wikipediaPages = await cleanPagesSQL()
+const wikipediaPagesKeyId = await cleanPagesSQL()

 const cleanPagesSQLWriteToFile = async () => {
+  console.log("cleanPagesSQLWriteToFile - Writing to file...")
  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "

-  const wikipediaPagesString = Object.entries(wikipediaPages)
-    .map(([title, id]) => {
+  const wikipediaPagesString = Object.entries(wikipediaPagesKeyId)
+    .map(([id, title]) => {
      return `(${id},${title})`
    })
    .join(",")
@ -108,6 +124,131 @@ const cleanPagesSQLWriteToFile = async () => {
    `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
    { encoding: "utf-8" },
  )
+  console.log("cleanPagesSQLWriteToFile - Done.")
 }

 await cleanPagesSQLWriteToFile()
+
+/**
+ * Function to clean the `pagelinks.sql` file by:
+ * - Skipping everything before the first line that starts with "INSERT INTO `pagelinks` VALUES " (rejects if the first chunk read contains no such line).
+ * - Rejecting with an error if an extracted row does not have exactly 5 columns.
+ * - Filter by keeping rows where both `pl_namespace` (2nd column) and `pl_from_namespace` (4th column) are equal to "0".
+ * - Only keeping links whose source id (`pl_from`, 1st column) and target title (`pl_title`, 3rd column) are both present in `wikipediaPagesKeyId`, i.e. pages returned by `cleanPagesSQL`.
+ * - Transform the rows to internal links with fromPageId and toPageId.
+ *
+ * Text left incomplete at the end of a chunk (`unCompleted`) is prepended to the next chunk.
+ * Progress is logged each time at least 1% more of the file has been read.
+ *
+ * NOTE(review): `toPageId` is read from the object built by `swapKeysAndValues`, whose values are former object keys, so it is presumably a string at runtime rather than a number - confirm before relying on the `WikipediaInternalLink` type.
+ * @returns {Promise<WikipediaInternalLink[]>}
+ */
+const cleanInternalLinksSQL = async () => {
+  /**
+   * @type {WikipediaInternalLink[]}
+   */
+  const internalLinks = []

+  /**
+   * @type {WikipediaPagesKeyTitle}
+   */
+  const wikipediaPagesKeyTitle = swapKeysAndValues(wikipediaPagesKeyId)

+  const INSERT_INTO_START_INPUT = "INSERT INTO `pagelinks` VALUES "
+  const sqlInputPath = path.join(SQL_DUMP_PATH, "pagelinks.sql")
+  const sqlInputStat = await fs.promises.stat(sqlInputPath)
+  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")

+  let isInsideInsert = false
+  let current = ""
+  let lastPercent = 0

+  return await new Promise((resolve, reject) => {
+    sqlInputFileStream
+      .on("data", (dataInput) => {
+        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
+        const bytesReadPercent = bytesReadRatio * 100

+        if (bytesReadPercent - lastPercent >= 1) {
+          console.log(
+            `cleanInternalLinksSQL - Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
+          )
+          lastPercent = bytesReadPercent
+        }

+        let data = current + dataInput

+        if (!isInsideInsert) {
+          const lines = data.split("\n").filter((line) => {
+            return line.startsWith(INSERT_INTO_START_INPUT)
+          })
+          const [line] = lines
+          if (line == null) {
+            sqlInputFileStream.close()
+            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
+          }
+          isInsideInsert = true
+          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
+          data = lineStripped
+        }

+        const { rows, unCompleted } = extractRowsFromSQLValues(data)
+        current = unCompleted

+        for (const row of rows) {
+          if (row.length !== 5) {
+            sqlInputFileStream.close()
+            console.error([row])
+            return reject(new Error(`Invalid Row values.`))
+          }

+          const plFromPageId = Number.parseInt(row[0] ?? "0", 10)
+          const plTargetNamespace = row[1] ?? ""
+          const plTargetTitle = row[2] ?? ""
+          const plFromNamespace = row[3] ?? ""

+          if (plFromNamespace === "0" && plTargetNamespace === "0") {
+            if (
+              wikipediaPagesKeyTitle[plTargetTitle] != null &&
+              wikipediaPagesKeyId[plFromPageId] != null
+            ) {
+              /**
+               * @type {WikipediaInternalLink}
+               */
+              const wikipediaInternalLink = {
+                fromPageId: plFromPageId,
+                toPageId: wikipediaPagesKeyTitle[plTargetTitle],
+              }
+              internalLinks.push(wikipediaInternalLink)
+            }
+          }
+        }
+      })
+      .on("error", (error) => {
+        return reject(error)
+      })
+      .on("close", () => {
+        console.log("cleanInternalLinksSQL - Bytes read (100%).")
+        return resolve(internalLinks)
+      })
+  })
+}
+
+const internalLinks = await cleanInternalLinksSQL()
+
+const cleanInternalLinksSQLWriteToFile = async () => {
+  console.log("cleanInternalLinksSQLWriteToFile - Writing to file...")
+  const sqlOutputPath = path.join(
+    SQL_OUTPUT_PATH,
+    "3-internal-links-inserts.sql",
+  )
+  const INSERT_INTO_START_OUTPUT = "INSERT INTO internal_links VALUES "
+
+  const wikipediaPagesString = internalLinks
+    .map(({ fromPageId, toPageId }) => {
+      return `(${fromPageId},${toPageId})`
+    })
+    .join(",")
+
+  await fs.promises.writeFile(
+    sqlOutputPath,
+    `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
+    { encoding: "utf-8" },
+  )
+  console.log("cleanInternalLinksSQLWriteToFile - Done.")
+}
+
+await cleanInternalLinksSQLWriteToFile()
--- a/data/download-wikipedia-dump.sh
+++ b/data/download-wikipedia-dump.sh
@ -8,7 +8,8 @@ set -o nounset
 set -o pipefail

 DUMP_DIRECTORY="dump"
-DOWNLOAD_DATE="latest"
+DOWNLOAD_DATE="20240420"
+# DOWNLOAD_DATE="latest"
 WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"

 mkdir --parents "${DUMP_DIRECTORY}"
@ -18,16 +19,16 @@ download_file() {
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

-  if [[ ! -f "${file_path_output}" ]]; then
+  if [[ ! -f "${file_path_output%.gz}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
-    echo "File \"${filename}\" from \"${file_url}\" already exists."
+    echo "File \"${filename%.gz}\" from \"${file_url}\" already exists."
  fi
 }

-download_file "page.sql.gz"
-download_file "pagelinks.sql.gz"
+# download_file "page.sql.gz"
+# download_file "pagelinks.sql.gz"

 extract_file() {
  local filename="${1}"
--- a/data/sql/1-tables-create.sql
+++ b/data/sql/1-tables-create.sql
@ -1,9 +1,19 @@
 CREATE TABLE `pages` (
  `id` INT(8) UNSIGNED NOT NULL AUTO_INCREMENT,
  `title` VARBINARY(255) NOT NULL DEFAULT '',
+
  PRIMARY KEY (`id`),
  UNIQUE KEY (`title`)
-) ENGINE=InnoDB AUTO_INCREMENT=77490241 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
+) ENGINE=InnoDB AUTO_INCREMENT=76684425 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;

 -- VARBINARY usage instead of VARCHAR explanation: <https://stackoverflow.com/a/13397437>
 -- > War on varchar. Changed all occurrences of varchar(N) and varchar(N) binary to varbinary(N). varchars cause problems ("Invalid mix of collations" errors) on MySQL databases with certain configs, most notably the default MySQL config.
+
+-- Directed link between two pages: the page `from_page_id` contains a link to the page `to_page_id`.
+-- Foreign keys are named explicitly so constraint errors and later migrations are easy to search for.
+CREATE TABLE `internal_links` (
+  `from_page_id` INT(8) UNSIGNED NOT NULL,
+  `to_page_id` INT(8) UNSIGNED NOT NULL,

+  PRIMARY KEY (`from_page_id`, `to_page_id`),
+  CONSTRAINT `internal_links_from_page_id_fk` FOREIGN KEY (`from_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE,
+  CONSTRAINT `internal_links_to_page_id_fk` FOREIGN KEY (`to_page_id`) REFERENCES `pages` (`id`) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
--- a/data/utils.js
+++ b/data/utils.js
@ -61,3 +61,16 @@ export const extractRowsFromSQLValues = (input) => {

  return { rows, unCompleted }
 }
+
+/**
+ * Swaps the keys and values of an object.
+ *
+ * Builds a new object from the entries returned by `Object.entries(object)`; the input object is not mutated.
+ * - Each original value is used as a property key of the result.
+ * - Each original key becomes a value, always as a string because `Object.entries` yields string keys (e.g. `{ 1: "a" }` gives `{ a: "1" }`).
+ * - If several keys share the same value, the last entry wins.
+ * @param {*} object Object whose entries are swapped.
+ * @returns {Record<string, string>} New object with keys and values swapped.
+ */
+export const swapKeysAndValues = (object) => {
+  return Object.fromEntries(
+    Object.entries(object).map(([key, value]) => {
+      return [value, key]
+    }),
+  )
+}