chore: clean up POC to get Wikipedia dump

Théo LUDWIG 2024-08-05 00:52:48 +02:00
parent 3de838dded
commit 61914d2392
Signed by: theoludwig
GPG Key ID: ADFE5A563D718F3B
10 changed files with 97 additions and 345 deletions

View File

@ -12,11 +12,12 @@
- [x] `page.sql` (`pages` table)
- [ ] `pagelinks.sql` (`internal_links` table)
- [x] Import SQL files
- [ ] Verify the file content up to just before the inserts, to check that it matches the last version, and diff against the last version
- [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for the 'Linux' page`
- [ ] Move the POC (Proof of Concept) from the `data` folder to the `apps/cli` folder
- [ ] Set up `.gitignore` correctly + documentation on how to use + last execution date
- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/))
- [ ] Documentation on how to use + last execution date
- [ ] Rewrite the bash script that downloads and extracts SQL files from the Wikipedia Database Dump in Node.js, for better cross-platform support, easier maintenance, and automation; preferably a single Node.js script that generates everything needed to create the database
- [ ] Verify the file content up to just before the inserts, to check that it matches the last version, and diff against the last version
- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) to get shortest paths between 2 pages (see the BFS sketch after this list)
- [ ] Implement Wikipedia Game Solver (`website`) with inputs, a submit button, and a list of all pages to go from one to the other, or none if no path exists
- [ ] Check how to deal with redirects (+ how they are represented in the Wikipedia Database Dump)
- [ ] Implement toast notifications for errors, warnings, and success messages
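
As a rough illustration of the planned shortest-path feature (see the REST API item above), here is a minimal breadth-first search sketch in JavaScript. It assumes the `internal_links` rows have already been loaded into an in-memory adjacency `Map` (source page id -> ids of the pages it links to); `findShortestPath` and that representation are illustrative, not existing code in this repository.

```js
/**
 * Minimal sketch (not part of this commit): breadth-first search to find one
 * shortest path between two pages, given an adjacency map built from the
 * `internal_links` table (source page id -> ids of the pages it links to).
 * @param {Map<number, number[]>} internalLinks
 * @param {number} sourceId
 * @param {number} targetId
 * @returns {number[] | null} Page ids from source to target, or `null` if unreachable.
 */
const findShortestPath = (internalLinks, sourceId, targetId) => {
  if (sourceId === targetId) {
    return [sourceId]
  }
  /** @type {Map<number, number>} Visited page id -> page id we came from. */
  const previous = new Map([[sourceId, sourceId]])
  let queue = [sourceId]
  while (queue.length > 0) {
    /** @type {number[]} */
    const nextQueue = []
    for (const pageId of queue) {
      for (const linkedId of internalLinks.get(pageId) ?? []) {
        if (previous.has(linkedId)) {
          continue
        }
        previous.set(linkedId, pageId)
        if (linkedId === targetId) {
          // Walk back from the target to the source to rebuild the path.
          const path = [targetId]
          let currentId = pageId
          while (currentId !== sourceId) {
            path.push(currentId)
            currentId = previous.get(currentId)
          }
          path.push(sourceId)
          return path.reverse()
        }
        nextQueue.push(linkedId)
      }
    }
    queue = nextQueue
  }
  return null
}
```

With such a map, `findShortestPath(internalLinks, sourceId, targetId)` would return the ids of the pages along one shortest path, which the planned `api`/`website` could then translate back to titles via the `pages` table.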

View File

@ -4,27 +4,19 @@
Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
To inspect the volume size used by the database: `docker system df -v | grep 'wikipedia-solver-postgres-data'`
To inspect the volume size used by the database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
## PostgreSQL related
### Import SQL file to PostgreSQL Docker Container
In `compose.yaml`, we can specify SQL scripts to be executed when the container starts for the first time.
```yaml
volumes:
  - "./sql:/docker-entrypoint-initdb.d/"
```
### Remove a volume
## Remove a volume
```sh
# List all volumes
docker volume ls
# Remove a volume
docker volume rm data_wikipedia-solver-postgres-data
docker volume rm data_wikipedia-solver-mariadb-data
# Or by using docker compose down
docker-compose down --volumes
```
## MySQL Related
@ -96,32 +88,4 @@ CREATE TABLE `page` (
--
-- Dumping data for table `page`
--
/*!40000 ALTER TABLE `page` DISABLE KEYS */;
INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL);
INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL),(12,0,'Anarchism',0,0,0.786172332974311,'20240731234111','20240731234202',1234495258,110759,'wikitext',NULL),(12281,0,'Gottfried_Wilhelm_Leibniz',0,0,0.79151204115852,'20240731234133','20240731234228',1237687724,155319,'wikitext',NULL),(13,0,'AfghanistanHistory',1,0,0.154661929211,'20240729123940','20240722220436',783865149,90,'wikitext',NULL),(14,0,'AfghanistanGeography',1,0,0.952234464653055,'20240722211426','20240722220436',783865160,92,'wikitext',NULL),(15,0,'AfghanistanPeople',1,0,0.047716566551,'20240722211426','20240722220436',783865293,95,'wikitext',NULL),(12473,1,'Gnosticism',0,0,0.00653186720472934,'20240801075011','20240731232236',1233717868,6579,'wikitext',NULL);
-- Expected output: INSERT INTO `page` VALUES (12,'Anarchism'),(12281,'Gottfried_Wilhelm_Leibniz');
```
### PostgreSQL short version
```sql
CREATE TABLE IF NOT EXISTS pages (
id BIGSERIAL PRIMARY KEY,
title VARCHAR(255) UNIQUE NOT NULL
-- is_redirect BOOLEAN NOT NULL DEFAULT FALSE
);
-- Examples of inserts
INSERT INTO pages VALUES (10, 'AccessibleComputing'); -- (is_redirect = true)
INSERT INTO pages VALUES (10474, 'Eight_queens_puzzle'); -- (is_redirect = false)
INSERT INTO pages VALUES
(10,'AccessibleComputing'),
(12,'Anarchism'),
(13,'AfghanistanHistory'),
(14,'AfghanistanGeography'),
(15,'AfghanistanPeople');
```

View File

@ -1,17 +1,4 @@
services:
  # wikipedia-solver-database:
  #   container_name: "wikipedia-solver-database"
  #   image: "postgres:16.3"
  #   restart: "unless-stopped"
  #   env_file: ".env"
  #   environment:
  #     POSTGRES_USER: ${DATABASE_USER}
  #     POSTGRES_PASSWORD: ${DATABASE_PASSWORD}
  #     POSTGRES_DB: ${DATABASE_NAME}
  #   volumes:
  #     - "wikipedia-solver-postgres-data:/var/lib/postgresql/data"
  #     - "./sql:/docker-entrypoint-initdb.d/"
  wikipedia-solver-database:
    container_name: "wikipedia-solver-database"
    image: "mariadb:10.6.17"
@ -47,25 +34,5 @@ services:
- "./adminer/logo.png:/var/www/html/logo.png"
- "./adminer/fonts/:/var/www/html/fonts"
# dbgate:
# image: "dbgate/dbgate:5.3.3"
# restart: "always"
# ports:
# - "8080:3000"
# volumes:
# - "dbgate-data:/root/.dbgate"
# environment:
# CONNECTIONS: "con1"
# LABEL_con1: "Postgres"
# SERVER_con1: "wikipedia-solver-database"
# USER_con1: ${DATABASE_USER}
# PASSWORD_con1: ${DATABASE_PASSWORD}
# PORT_con1: 5432
# ENGINE_con1: "postgres@dbgate-plugin-postgres"
volumes:
wikipedia-solver-mariadb-data:
# wikipedia-solver-postgres-data:
# dbgate-data:
# driver: "local"

View File

@ -1,113 +0,0 @@
import fs from "node:fs"
import path from "node:path"
import { extractRowsFromSQLValues } from "./utils.js"
const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
/**
* @typedef {Record<string, number>} WikipediaPages
*
* Object to store pages from Wikipedia:
* - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
* - Value: page id.
*/
/**
* Function to clean the `page.sql` file by:
* - Removing all lines that don't start with `INSERT INTO...`.
* - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
* - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
* @returns {Promise<WikipediaPages>}
*/
const cleanPagesSQL = async () => {
/** @type {WikipediaPages} */
const wikipediaPages = {}
const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
const sqlInputStat = await fs.promises.stat(sqlInputPath)
const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
let isInsideInsert = false
let current = ""
let lastPercent = 0
return await new Promise((resolve, reject) => {
sqlInputFileStream
.on("data", (dataInput) => {
const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
const bytesReadPercent = bytesReadRatio * 100
if (bytesReadPercent - lastPercent >= 1) {
console.log(
`Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
)
lastPercent = bytesReadPercent
}
let data = current + dataInput
if (!isInsideInsert) {
const lines = data.split("\n").filter((line) => {
return line.startsWith(INSERT_INTO_START_INPUT)
})
const [line] = lines
if (line == null) {
sqlInputFileStream.close()
return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
}
isInsideInsert = true
const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
data = lineStripped
}
const { rows, unCompleted } = extractRowsFromSQLValues(data)
current = unCompleted
for (const row of rows) {
if (row.length !== 12) {
sqlInputFileStream.close()
console.error([row])
return reject(new Error(`Invalid Row values.`))
}
const id = Number.parseInt(row[0] ?? "0", 10)
const namespace = row[1] ?? ""
const title = row[2] ?? ""
const isRedirect = row[3] === "1"
if (namespace === "0" && !isRedirect) {
wikipediaPages[title] = id
}
}
})
.on("error", (error) => {
return reject(error)
})
.on("close", () => {
return resolve(wikipediaPages)
})
})
}
const wikipediaPages = await cleanPagesSQL()
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
const wikipediaPagesString = Object.entries(wikipediaPages)
.map(([title, id]) => {
return `(${id},${title})`
})
.join(",")
await fs.promises.writeFile(
sqlOutputPath,
`${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
{ encoding: "utf-8" },
)
// const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
// await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
// await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")

View File

@ -5,22 +5,29 @@ import { extractRowsFromSQLValues } from "./utils.js"
const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
/**
* @typedef {Record<string, number>} WikipediaPages
*
* Object to store pages from Wikipedia:
* - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
* - Value: page id.
*/
/**
* Function to clean the `page.sql` file by:
* - Removing all lines that don't start with `INSERT INTO...`.
* - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
* - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
* @returns {Promise<WikipediaPages>}
*/
const cleanPagesSQL = async () => {
const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES\n"
const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
/** @type {WikipediaPages} */
const wikipediaPages = {}
const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
const sqlInputStat = await fs.promises.stat(sqlInputPath)
const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
let isInsideInsert = false
let current = ""
@ -28,7 +35,7 @@ const cleanPagesSQL = async () => {
return await new Promise((resolve, reject) => {
sqlInputFileStream
.on("data", async (dataInput) => {
.on("data", (dataInput) => {
const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
const bytesReadPercent = bytesReadRatio * 100
@ -39,9 +46,6 @@ const cleanPagesSQL = async () => {
lastPercent = bytesReadPercent
}
/**
* @type {string}
*/
let data = current + dataInput
if (!isInsideInsert) {
@ -74,21 +78,36 @@ const cleanPagesSQL = async () => {
const isRedirect = row[3] === "1"
if (namespace === "0" && !isRedirect) {
await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")
wikipediaPages[title] = id
}
}
})
.on("error", async (error) => {
await sqlOutputFile.close()
.on("error", (error) => {
return reject(error)
})
.on("close", async () => {
console.log(`Cleaned "${sqlInputPath}" to "${sqlOutputPath}".`)
await sqlOutputFile.appendFile(";\n", "utf-8")
await sqlOutputFile.close()
return resolve()
.on("close", () => {
return resolve(wikipediaPages)
})
})
}
await cleanPagesSQL()
const wikipediaPages = await cleanPagesSQL()
const cleanPagesSQLWriteToFile = async () => {
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
const wikipediaPagesString = Object.entries(wikipediaPages)
.map(([title, id]) => {
return `(${id},${title})`
})
.join(",")
await fs.promises.writeFile(
sqlOutputPath,
`${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
{ encoding: "utf-8" },
)
}
await cleanPagesSQLWriteToFile()

View File

@ -1,79 +0,0 @@
#!/usr/bin/env bash
# Usage: ./database-wikipedia.sh
# Description: Download and extract Wikipedia database dumps.
set -o errexit
set -o nounset
set -o pipefail
DUMP_DIRECTORY="dump"
SQL_OUTPUT_DIRECTORY="sql"
DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
mkdir --parents "${DUMP_DIRECTORY}"
download_file() {
local filename="${1}"
local file_path_output="${DUMP_DIRECTORY}/${filename}"
local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
if [[ ! -f "${file_path_output}" ]]; then
echo "Downloading \"${filename}\" from \"${file_url}\"..."
wget --output-document="${file_path_output}" "${file_url}"
else
echo "File \"${filename}\" from \"${file_url}\" already exists."
fi
}
# download_file "page.sql.gz"
# download_file "pagelinks.sql.gz"
extract_file() {
local filename="${1}"
local file_path_input="${DUMP_DIRECTORY}/${filename}"
local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
if [[ ! -f "${file_path_output}" ]]; then
echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
gzip --decompress "${file_path_input}"
# `--keep` flag to keep the original file, not needed here.
# gzip --decompress --keep "${file_path_input}"
else
echo "File \"${filename}\" already extracted."
fi
}
# extract_file "page.sql.gz"
# extract_file "pagelinks.sql.gz"
# Function to clean the `page.sql` file by:
# - Removing all lines that don't start with `INSERT INTO...`.
# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0.
# - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`.
# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`.
# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL.
clean_pages_sql() {
local sql_input_file_directory="${1}"
local sql_input="${sql_input_file_directory}/page.sql"
local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"
sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" |
grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" |
sed 's/),(/)\n(/g' |
grep -P "\([0-9]+,0,'.*?',0" |
sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" |
sed "s/\\\'/''/g" | # Replace escaped single quotes
sed 's/\\"/"/g' | # Replace escaped double quotes
sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' |
sed '$ s/,$/;/g' >"$sql_output"
echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
}
# clean_pages_sql "${DUMP_DIRECTORY}"

data/download-wikipedia-dump.sh Executable file (49 lines added)
View File

@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Usage: ./download-wikipedia-dump.sh
# Description: Download and extract Wikipedia database dumps.
set -o errexit
set -o nounset
set -o pipefail
DUMP_DIRECTORY="dump"
DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
mkdir --parents "${DUMP_DIRECTORY}"
download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" from \"${file_url}\" already exists."
  fi
}

download_file "page.sql.gz"
download_file "pagelinks.sql.gz"

extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    gzip --decompress "${file_path_input}"
    # `--keep` flag to keep the original file, not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}
extract_file "page.sql.gz"
extract_file "pagelinks.sql.gz"

View File

@ -1,4 +0,0 @@
SET AUTOCOMMIT = 0;
SET FOREIGN_KEY_CHECKS = 0;
SET UNIQUE_CHECKS = 0;
BEGIN;

View File

@ -1,4 +0,0 @@
COMMIT;
SET AUTOCOMMIT = 1;
SET FOREIGN_KEY_CHECKS = 1;
SET UNIQUE_CHECKS = 1;

View File

@ -1,48 +0,0 @@
import { extractRowsFromSQLValues } from "./utils.js"
console.log(
"output:",
extractRowsFromSQLValues("(1,'-)',0),(2,'Demographics_of_American_Samoa',0)"),
)
console.log(
"output:",
extractRowsFromSQLValues(
`(1,'-d\\'ff)',0),(2,'Demographics_of_American_Samoa',0)`,
),
)
console.log(
"output:",
extractRowsFromSQLValues(
"(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11,'abc',ddf,123,43,'dff'",
),
)
console.log(
"output:",
extractRowsFromSQLValues(
"(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11",
),
)
console.log(
"output:",
extractRowsFromSQLValues(
"(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(",
),
)
console.log(
"output:",
extractRowsFromSQLValues(
`(1,'-)',0),(2,'C\\\\',1,0),(2,'Demographics_of_American_Samoa',0)`,
),
)
console.log(
"output:",
extractRowsFromSQLValues(
`(1,'-)',0),(2,'Good_Singin\\',_Good_Playin\\'',1,0),(2,'Demographics_of_American_Samoa',0)`,
),
)
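
These (now removed) smoke tests exercise `extractRowsFromSQLValues` from `utils.js`, which is not part of this diff. For reference, here is a minimal sketch of such a parser, assuming it returns `{ rows, unCompleted }`, where `rows` holds the complete `(...)` tuples split into string values (MySQL escape sequences such as `\'` and `\\` kept as-is) and `unCompleted` is the trailing partial tuple to prepend to the next stream chunk. This is an assumption-based sketch, not the repository's actual implementation.

```js
/**
 * Minimal sketch (not the repository's actual `utils.js` implementation).
 * Splits a chunk of a MySQL `INSERT INTO ... VALUES (...),(...)` payload into
 * complete rows of string values, keeping MySQL escape sequences (`\'`, `\\`)
 * as-is, and returns the trailing incomplete part so the caller can prepend it
 * to the next chunk of the stream.
 * @param {string} data
 * @returns {{ rows: string[][], unCompleted: string }}
 */
export const extractRowsFromSQLValues = (data) => {
  /** @type {string[][]} */
  const rows = []
  /** @type {string[]} */
  let row = []
  let value = ""
  let isInsideRow = false
  let isInsideString = false
  let lastRowEndIndex = 0

  for (let index = 0; index < data.length; index += 1) {
    const character = data[index]

    if (isInsideString) {
      if (character === "\\") {
        // Keep the escape sequence (e.g. `\'` or `\\`) and skip the escaped character.
        value += character + (data[index + 1] ?? "")
        index += 1
      } else if (character === "'") {
        isInsideString = false
      } else {
        value += character
      }
      continue
    }

    if (!isInsideRow) {
      // Outside a row: wait for the next `(`, ignoring the `,` between rows.
      if (character === "(") {
        isInsideRow = true
        row = []
        value = ""
      }
      continue
    }

    if (character === "'") {
      isInsideString = true
    } else if (character === ",") {
      row.push(value)
      value = ""
    } else if (character === ")") {
      row.push(value)
      rows.push(row)
      isInsideRow = false
      lastRowEndIndex = index + 1
    } else {
      value += character
    }
  }

  return { rows, unCompleted: data.slice(lastRowEndIndex) }
}
```

With the first test input above, this sketch returns `{ rows: [["1", "-)", "0"], ["2", "Demographics_of_American_Samoa", "0"]], unCompleted: "" }`; incomplete trailing tuples end up in `unCompleted`, matching how the cleaning script carries `current` between stream chunks.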