chore: clean up POC to get Wikipedia dump
Parent: 3de838dded
Commit: 61914d2392
TODO.md

@@ -12,11 +12,12 @@
 - [x] `page.sql` (`pages` tables)
 - [ ] `pagelinks.sql` (`internal_links` tables)
 - [x] Import SQL files
-- [ ] Verify file content up to before inserts, to check if it matches last version, and diff with last version
 - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
 - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder
-- [ ] `.gitignore` correctly + Documentation how to use + Last execution date
-- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/))
+- [ ] Documentation how to use + Last execution date
+- [ ] Rewrite bash script to download and extract SQL files from Wikipedia Database Dump to Node.js for better cross-platform support and easier maintenance + automation, preferably one Node.js script to generate everything to create the database
+- [ ] Verify file content up to before inserts, to check if it matches last version, and diff with last version
+- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) to get shortest paths between 2 pages
 - [ ] Implement Wikipedia Game Solver (`website`) with inputs, button to submit, and list all pages to go from one to another, or none if it is not possible
 - [ ] Check how to deal with redirects (+ Wikipedia Database Dump related)
 - [ ] Implement toast notifications for errors, warnings, and success messages
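One of the new TODO items above plans to rewrite the download/extract bash script in Node.js. As a rough illustration only (not part of this commit), a single Node.js script could stream each dump through gunzip to disk; this sketch assumes Node.js 18+ for the built-in `fetch` and reuses the dump URL and file names from `data/download-wikipedia-dump.sh` added later in this commit:

```js
// Hypothetical sketch only, not part of this commit: a Node.js replacement for
// the download + extract steps of data/download-wikipedia-dump.sh.
import fs from "node:fs"
import zlib from "node:zlib"
import { Readable } from "node:stream"
import { pipeline } from "node:stream/promises"

const DUMP_DIRECTORY = "dump"
const WIKIPEDIA_DUMP_URL = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-"

const downloadAndExtractFile = async (filename) => {
  const response = await fetch(`${WIKIPEDIA_DUMP_URL}${filename}`)
  if (!response.ok || response.body == null) {
    throw new Error(`Failed to download "${filename}" (HTTP ${response.status}).`)
  }

  // Stream the gzipped dump through gunzip straight to disk,
  // so the multi-gigabyte file is never held in memory.
  const outputPath = `${DUMP_DIRECTORY}/${filename.replace(/\.gz$/, "")}`
  await fs.promises.mkdir(DUMP_DIRECTORY, { recursive: true })
  await pipeline(
    Readable.fromWeb(response.body),
    zlib.createGunzip(),
    fs.createWriteStream(outputPath),
  )
}

await downloadAndExtractFile("page.sql.gz")
await downloadAndExtractFile("pagelinks.sql.gz")
```

Streaming the HTTP response through `zlib.createGunzip()` keeps memory usage flat, which mirrors what the `wget` + `gzip --decompress` pair does in the shell version.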
@@ -4,27 +4,19 @@
 
 Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
 
-To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-postgres-data'`
+To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
 
-## PostgreSQL related
+## Remove a volume
 
-### Import SQL file to PostgreSQL Docker Container
-
-In `compose.yaml`, we can specify SQL scripts to be executed when the container starts for the first time.
-
-```yaml
-volumes:
-  - "./sql:/docker-entrypoint-initdb.d/"
-```
-
-### Remove a volume
-
 ```sh
 # List all volumes
 docker volume ls
 
 # Remove a volume
-docker volume rm data_wikipedia-solver-postgres-data
+docker volume rm data_wikipedia-solver-mariadb-data
 
+# Or by using docker compose down
+docker-compose down --volumes
 ```
 
 ## MySQL Related
@@ -96,32 +88,4 @@ CREATE TABLE `page` (
 --
 -- Dumping data for table `page`
 --
-
-/*!40000 ALTER TABLE `page` DISABLE KEYS */;
-INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL);
-
-INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL),(12,0,'Anarchism',0,0,0.786172332974311,'20240731234111','20240731234202',1234495258,110759,'wikitext',NULL),(12281,0,'Gottfried_Wilhelm_Leibniz',0,0,0.79151204115852,'20240731234133','20240731234228',1237687724,155319,'wikitext',NULL),(13,0,'AfghanistanHistory',1,0,0.154661929211,'20240729123940','20240722220436',783865149,90,'wikitext',NULL),(14,0,'AfghanistanGeography',1,0,0.952234464653055,'20240722211426','20240722220436',783865160,92,'wikitext',NULL),(15,0,'AfghanistanPeople',1,0,0.047716566551,'20240722211426','20240722220436',783865293,95,'wikitext',NULL),(12473,1,'Gnosticism',0,0,0.00653186720472934,'20240801075011','20240731232236',1233717868,6579,'wikitext',NULL);
--- Expected output: INSERT INTO `page` VALUES (12,'Anarchism'),(12281,'Gottfried_Wilhelm_Leibniz');
-```
-
-### PostgreSQL short version
-
-```sql
-CREATE TABLE IF NOT EXISTS pages (
-  id BIGSERIAL PRIMARY KEY,
-  title VARCHAR(255) UNIQUE NOT NULL
-
-  -- is_redirect BOOLEAN NOT NULL DEFAULT FALSE
-);
-
--- Examples of inserts
-INSERT INTO pages VALUES (10, 'AccessibleComputing'); -- (is_redirect = true)
-INSERT INTO pages VALUES (10474, 'Eight_queens_puzzle'); -- (is_redirect = false)
-
-INSERT INTO pages VALUES
-(10,'AccessibleComputing'),
-(12,'Anarchism'),
-(13,'AfghanistanHistory'),
-(14,'AfghanistanGeography'),
-(15,'AfghanistanPeople');
 ```
@@ -1,17 +1,4 @@
 services:
-  # wikipedia-solver-database:
-  #   container_name: "wikipedia-solver-database"
-  #   image: "postgres:16.3"
-  #   restart: "unless-stopped"
-  #   env_file: ".env"
-  #   environment:
-  #     POSTGRES_USER: ${DATABASE_USER}
-  #     POSTGRES_PASSWORD: ${DATABASE_PASSWORD}
-  #     POSTGRES_DB: ${DATABASE_NAME}
-  #   volumes:
-  #     - "wikipedia-solver-postgres-data:/var/lib/postgresql/data"
-  #     - "./sql:/docker-entrypoint-initdb.d/"
-
   wikipedia-solver-database:
     container_name: "wikipedia-solver-database"
     image: "mariadb:10.6.17"
@@ -47,25 +34,5 @@ services:
       - "./adminer/logo.png:/var/www/html/logo.png"
       - "./adminer/fonts/:/var/www/html/fonts"
-
-  # dbgate:
-  #   image: "dbgate/dbgate:5.3.3"
-  #   restart: "always"
-  #   ports:
-  #     - "8080:3000"
-  #   volumes:
-  #     - "dbgate-data:/root/.dbgate"
-  #   environment:
-  #     CONNECTIONS: "con1"
-
-  #     LABEL_con1: "Postgres"
-  #     SERVER_con1: "wikipedia-solver-database"
-  #     USER_con1: ${DATABASE_USER}
-  #     PASSWORD_con1: ${DATABASE_PASSWORD}
-  #     PORT_con1: 5432
-  #     ENGINE_con1: "postgres@dbgate-plugin-postgres"
 
 volumes:
   wikipedia-solver-mariadb-data:
-  # wikipedia-solver-postgres-data:
-  # dbgate-data:
-  #   driver: "local"
@@ -1,113 +0,0 @@
-import fs from "node:fs"
-import path from "node:path"
-import { extractRowsFromSQLValues } from "./utils.js"
-
-const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
-const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
-
-/**
- * @typedef {Record<string, number>} WikipediaPages
- *
- * Object to store pages from Wikipedia:
- * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
- * - Value: page id.
- */
-
-/**
- * Function to clean the `page.sql` file by:
- * - Removing all lines that don't start with `INSERT INTO...`.
- * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
- * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
- * @returns {Promise<WikipediaPages>}
- */
-const cleanPagesSQL = async () => {
-  /** @type {WikipediaPages} */
-  const wikipediaPages = {}
-
-  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
-  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
-  const sqlInputStat = await fs.promises.stat(sqlInputPath)
-  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
-
-  let isInsideInsert = false
-  let current = ""
-  let lastPercent = 0
-
-  return await new Promise((resolve, reject) => {
-    sqlInputFileStream
-      .on("data", (dataInput) => {
-        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
-        const bytesReadPercent = bytesReadRatio * 100
-
-        if (bytesReadPercent - lastPercent >= 1) {
-          console.log(
-            `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
-          )
-          lastPercent = bytesReadPercent
-        }
-
-        let data = current + dataInput
-
-        if (!isInsideInsert) {
-          const lines = data.split("\n").filter((line) => {
-            return line.startsWith(INSERT_INTO_START_INPUT)
-          })
-          const [line] = lines
-          if (line == null) {
-            sqlInputFileStream.close()
-            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
-          }
-          isInsideInsert = true
-          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
-          data = lineStripped
-        }
-
-        const { rows, unCompleted } = extractRowsFromSQLValues(data)
-        current = unCompleted
-
-        for (const row of rows) {
-          if (row.length !== 12) {
-            sqlInputFileStream.close()
-            console.error([row])
-            return reject(new Error(`Invalid Row values.`))
-          }
-
-          const id = Number.parseInt(row[0] ?? "0", 10)
-          const namespace = row[1] ?? ""
-          const title = row[2] ?? ""
-          const isRedirect = row[3] === "1"
-
-          if (namespace === "0" && !isRedirect) {
-            wikipediaPages[title] = id
-          }
-        }
-      })
-      .on("error", (error) => {
-        return reject(error)
-      })
-      .on("close", () => {
-        return resolve(wikipediaPages)
-      })
-  })
-}
-
-const wikipediaPages = await cleanPagesSQL()
-
-const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
-const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
-
-const wikipediaPagesString = Object.entries(wikipediaPages)
-  .map(([title, id]) => {
-    return `(${id},${title})`
-  })
-  .join(",")
-
-await fs.promises.writeFile(
-  sqlOutputPath,
-  `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
-  { encoding: "utf-8" },
-)
-
-// const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
-// await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
-// await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")
@@ -5,22 +5,29 @@ import { extractRowsFromSQLValues } from "./utils.js"
 const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
 const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
 
+/**
+ * @typedef {Record<string, number>} WikipediaPages
+ *
+ * Object to store pages from Wikipedia:
+ * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
+ * - Value: page id.
+ */
+
 /**
  * Function to clean the `page.sql` file by:
  * - Removing all lines that don't start with `INSERT INTO...`.
  * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
  * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
+ * @returns {Promise<WikipediaPages>}
  */
 const cleanPagesSQL = async () => {
-  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
-  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES\n"
-  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
-  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
+  /** @type {WikipediaPages} */
+  const wikipediaPages = {}
 
+  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
+  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
   const sqlInputStat = await fs.promises.stat(sqlInputPath)
   const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
-  const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
-  await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
 
   let isInsideInsert = false
   let current = ""

@@ -28,7 +35,7 @@ const cleanPagesSQL = async () => {
 
   return await new Promise((resolve, reject) => {
     sqlInputFileStream
-      .on("data", async (dataInput) => {
+      .on("data", (dataInput) => {
         const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
         const bytesReadPercent = bytesReadRatio * 100
 

@@ -39,9 +46,6 @@ const cleanPagesSQL = async () => {
           lastPercent = bytesReadPercent
         }
 
-        /**
-         * @type {string}
-         */
         let data = current + dataInput
 
         if (!isInsideInsert) {

@@ -74,21 +78,36 @@ const cleanPagesSQL = async () => {
           const isRedirect = row[3] === "1"
 
           if (namespace === "0" && !isRedirect) {
-            await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")
+            wikipediaPages[title] = id
           }
         }
       })
-      .on("error", async (error) => {
-        await sqlOutputFile.close()
+      .on("error", (error) => {
         return reject(error)
       })
-      .on("close", async () => {
-        console.log(`Cleaned "${sqlInputPath}" to "${sqlOutputPath}".`)
-        await sqlOutputFile.appendFile(";\n", "utf-8")
-        await sqlOutputFile.close()
-        return resolve()
+      .on("close", () => {
+        return resolve(wikipediaPages)
       })
   })
 }
 
-await cleanPagesSQL()
+const wikipediaPages = await cleanPagesSQL()
+
+const cleanPagesSQLWriteToFile = async () => {
+  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
+  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
+
+  const wikipediaPagesString = Object.entries(wikipediaPages)
+    .map(([title, id]) => {
+      return `(${id},${title})`
+    })
+    .join(",")
+
+  await fs.promises.writeFile(
+    sqlOutputPath,
+    `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
+    { encoding: "utf-8" },
+  )
+}
+
+await cleanPagesSQLWriteToFile()
@@ -1,79 +0,0 @@
-#!/usr/bin/env bash
-
-# Usage: ./database-wikipedia.sh
-# Description: Download and extract Wikipedia database dumps.
-
-set -o errexit
-set -o nounset
-set -o pipefail
-
-DUMP_DIRECTORY="dump"
-SQL_OUTPUT_DIRECTORY="sql"
-DOWNLOAD_DATE="latest"
-WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
-
-mkdir --parents "${DUMP_DIRECTORY}"
-
-download_file() {
-  local filename="${1}"
-  local file_path_output="${DUMP_DIRECTORY}/${filename}"
-  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
-
-  if [[ ! -f "${file_path_output}" ]]; then
-    echo "Downloading \"${filename}\" from \"${file_url}\"..."
-    wget --output-document="${file_path_output}" "${file_url}"
-  else
-    echo "File \"${filename}\" from \"${file_url}\" already exists."
-  fi
-}
-
-# download_file "page.sql.gz"
-# download_file "pagelinks.sql.gz"
-
-extract_file() {
-  local filename="${1}"
-  local file_path_input="${DUMP_DIRECTORY}/${filename}"
-  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
-
-  if [[ ! -f "${file_path_output}" ]]; then
-    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
-    gzip --decompress "${file_path_input}"
-
-    # `--keep` flag to keep the original file, not needed here.
-    # gzip --decompress --keep "${file_path_input}"
-  else
-    echo "File \"${filename}\" already extracted."
-  fi
-}
-
-# extract_file "page.sql.gz"
-# extract_file "pagelinks.sql.gz"
-
-# Function to clean the `page.sql` file by:
-# - Removing all lines that don't start with `INSERT INTO...`.
-# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0.
-# - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
-# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
-# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`.
-# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`.
-# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL.
-clean_pages_sql() {
-  local sql_input_file_directory="${1}"
-  local sql_input="${sql_input_file_directory}/page.sql"
-  local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"
-
-  sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" |
-    grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" |
-    sed 's/),(/)\n(/g' |
-    grep -P "\([0-9]+,0,'.*?',0" |
-    sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" |
-    sed "s/\\\'/''/g" | # Replace escaped single quotes
-    sed 's/\\"/"/g' | # Replace escaped double quotes
-    sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
-    awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' |
-    sed '$ s/,$/;/g' >"$sql_output"
-
-  echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
-}
-
-# clean_pages_sql "${DUMP_DIRECTORY}"
data/download-wikipedia-dump.sh (new executable file)

@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+# Usage: ./download-wikipedia-dump.sh
+# Description: Download and extract Wikipedia database dumps.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+DUMP_DIRECTORY="dump"
+DOWNLOAD_DATE="latest"
+WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
+
+mkdir --parents "${DUMP_DIRECTORY}"
+
+download_file() {
+  local filename="${1}"
+  local file_path_output="${DUMP_DIRECTORY}/${filename}"
+  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
+
+  if [[ ! -f "${file_path_output}" ]]; then
+    echo "Downloading \"${filename}\" from \"${file_url}\"..."
+    wget --output-document="${file_path_output}" "${file_url}"
+  else
+    echo "File \"${filename}\" from \"${file_url}\" already exists."
+  fi
+}
+
+download_file "page.sql.gz"
+download_file "pagelinks.sql.gz"
+
+extract_file() {
+  local filename="${1}"
+  local file_path_input="${DUMP_DIRECTORY}/${filename}"
+  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
+
+  if [[ ! -f "${file_path_output}" ]]; then
+    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
+    gzip --decompress "${file_path_input}"
+
+    # `--keep` flag to keep the original file, not needed here.
+    # gzip --decompress --keep "${file_path_input}"
+  else
+    echo "File \"${filename}\" already extracted."
+  fi
+}
+
+extract_file "page.sql.gz"
+extract_file "pagelinks.sql.gz"
@@ -1,4 +0,0 @@
-SET AUTOCOMMIT = 0;
-SET FOREIGN_KEY_CHECKS = 0;
-SET UNIQUE_CHECKS = 0;
-BEGIN;
@@ -1,4 +0,0 @@
-COMMIT;
-SET AUTOCOMMIT = 1;
-SET FOREIGN_KEY_CHECKS = 1;
-SET UNIQUE_CHECKS = 1;
data/test.js (deleted file)

@@ -1,48 +0,0 @@
-import { extractRowsFromSQLValues } from "./utils.js"
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues("(1,'-)',0),(2,'Demographics_of_American_Samoa',0)"),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-d\\'ff)',0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11,'abc',ddf,123,43,'dff'",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-)',0),(2,'C:\\\\',1,0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-)',0),(2,'Good_Singin\\',_Good_Playin\\'',1,0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
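The deleted `data/test.js` above exercises `extractRowsFromSQLValues` from `./utils.js`, which this commit does not touch or show. For reference, a minimal sketch that is consistent with those test inputs (an assumption about the behaviour, not the repository's actual `utils.js`) could look like:

```js
// Hypothetical sketch of `extractRowsFromSQLValues`, not the repository's
// actual implementation. It walks a chunk of SQL `(...),(...),...` VALUES
// text, tracking quoting and backslash escapes, and returns the complete
// rows plus the uncompleted trailing text.
export const extractRowsFromSQLValues = (input) => {
  const rows = []
  let row = []
  let value = ""
  let insideString = false
  let insideRow = false
  let lastRowEnd = 0

  for (let index = 0; index < input.length; index += 1) {
    const character = input[index]

    if (insideString) {
      if (character === "\\") {
        // Keep the escaped character and skip over the backslash.
        value += input[index + 1] ?? ""
        index += 1
      } else if (character === "'") {
        insideString = false
      } else {
        value += character
      }
    } else if (character === "'") {
      insideString = true
    } else if (character === "(" && !insideRow) {
      insideRow = true
      row = []
      value = ""
    } else if (character === "," && insideRow) {
      row.push(value)
      value = ""
    } else if (character === ")" && insideRow) {
      row.push(value)
      rows.push(row)
      insideRow = false
      lastRowEnd = index + 1
    } else if (insideRow) {
      value += character
    }
    // Separators between rows (commas) are skipped.
  }

  // Anything after the last complete row is returned so the caller can
  // prepend it to the next chunk of the stream.
  const unCompleted = input.slice(lastRowEnd).replace(/^,/, "")
  return { rows, unCompleted }
}
```

The `unCompleted` remainder is what the cleaning script prepends to its next stream chunk (`let data = current + dataInput`), so a row split across two chunks is parsed once the rest of it arrives.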