diff --git a/TODO.md b/TODO.md
index c7974ed..0dfcacc 100644
--- a/TODO.md
+++ b/TODO.md
@@ -12,11 +12,12 @@
   - [x] `page.sql` (`pages` tables)
   - [ ] `pagelinks.sql` (`internal_links` tables)
   - [x] Import SQL files
-  - [ ] Verify file content up to before inserts, to check if it matches last version, and diff with last version
   - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
   - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder
-  - [ ] `.gitignore` correctly + Documentation how to use + Last execution date
-- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/))
+  - [ ] Documentation on how to use + Last execution date
+  - [ ] Rewrite the bash script that downloads and extracts SQL files from the Wikipedia Database Dump in Node.js, for better cross-platform support, easier maintenance, and automation; preferably a single Node.js script that generates everything needed to create the database
+  - [ ] Verify file content up to just before the inserts, to check if it matches the last version, and diff against the last version
+- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) to get the shortest paths between 2 pages
 - [ ] Implement Wikipedia Game Solver (`website`) with inputs, a button to submit, and a list of all pages to go from one to the other, or none if it is not possible
 - [ ] Check how to deal with redirects (+ Wikipedia Database Dump related)
 - [ ] Implement toast notifications for errors, warnings, and success messages
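The `SELECT count(*)` TODO item kept above references the alias `p` inside its subquery without ever declaring it, so the query fails as written. A corrected version, assuming the `pages` / `internal_links` schema sketched in `data/README.md`:

```sql
-- Count of internal links for the 'Linux' page.
SELECT count(*)
FROM internal_links il
WHERE il.source_id = (
  SELECT p.id
  FROM pages p -- alias `p` is declared here
  WHERE p.title = 'Linux'
);
```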
diff --git a/data/README.md b/data/README.md
index 88d3c29..0f114e1 100644
--- a/data/README.md
+++ b/data/README.md
@@ -4,27 +4,19 @@
 
 Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
 
-To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-postgres-data'`
+To inspect the volume size used by the database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
 
-## PostgreSQL related
-
-### Import SQL file to PostgreSQL Docker Container
-
-In `compose.yaml`, we can specify SQL scripts to be executed when the container starts for the first time.
-
-```yaml
-volumes:
-  - "./sql:/docker-entrypoint-initdb.d/"
-```
-
-### Remove a volume
+## Remove a volume
 
 ```sh
 # List all volumes
 docker volume ls
 
 # Remove a volume
-docker volume rm data_wikipedia-solver-postgres-data
+docker volume rm data_wikipedia-solver-mariadb-data
+
+# Or by using docker compose down
+docker-compose down --volumes
 ```
 
 ## MySQL Related
@@ -96,32 +88,4 @@ CREATE TABLE `page` (
 --
 -- Dumping data for table `page`
 --
-
-/*!40000 ALTER TABLE `page` DISABLE KEYS */;
-INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL);
-
-INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL),(12,0,'Anarchism',0,0,0.786172332974311,'20240731234111','20240731234202',1234495258,110759,'wikitext',NULL),(12281,0,'Gottfried_Wilhelm_Leibniz',0,0,0.79151204115852,'20240731234133','20240731234228',1237687724,155319,'wikitext',NULL),(13,0,'AfghanistanHistory',1,0,0.154661929211,'20240729123940','20240722220436',783865149,90,'wikitext',NULL),(14,0,'AfghanistanGeography',1,0,0.952234464653055,'20240722211426','20240722220436',783865160,92,'wikitext',NULL),(15,0,'AfghanistanPeople',1,0,0.047716566551,'20240722211426','20240722220436',783865293,95,'wikitext',NULL),(12473,1,'Gnosticism',0,0,0.00653186720472934,'20240801075011','20240731232236',1233717868,6579,'wikitext',NULL);
--- Expected output: INSERT INTO `page` VALUES (12,'Anarchism'),(12281,'Gottfried_Wilhelm_Leibniz');
-```
-
-### PostgreSQL short version
-
-```sql
-CREATE TABLE IF NOT EXISTS pages (
-  id BIGSERIAL PRIMARY KEY,
-  title VARCHAR(255) UNIQUE NOT NULL
-
-  -- is_redirect BOOLEAN NOT NULL DEFAULT FALSE
-);
-
--- Examples of inserts
-INSERT INTO pages VALUES (10, 'AccessibleComputing'); -- (is_redirect = true)
-INSERT INTO pages VALUES (10474, 'Eight_queens_puzzle'); -- (is_redirect = false)
-
-INSERT INTO pages VALUES
-(10,'AccessibleComputing'),
-(12,'Anarchism'),
-(13,'AfghanistanHistory'),
-(14,'AfghanistanGeography'),
-(15,'AfghanistanPeople');
 ```
diff --git a/data/compose.yaml b/data/compose.yaml
index 8e728b0..a1c0b3e 100644
--- a/data/compose.yaml
+++ b/data/compose.yaml
@@ -1,17 +1,4 @@
 services:
-  # wikipedia-solver-database:
-  #   container_name: "wikipedia-solver-database"
-  #   image: "postgres:16.3"
-  #   restart: "unless-stopped"
-  #   env_file: ".env"
-  #   environment:
-  #     POSTGRES_USER: ${DATABASE_USER}
-  #     POSTGRES_PASSWORD: ${DATABASE_PASSWORD}
-  #     POSTGRES_DB: ${DATABASE_NAME}
-  #   volumes:
-  #     - "wikipedia-solver-postgres-data:/var/lib/postgresql/data"
-  #     - "./sql:/docker-entrypoint-initdb.d/"
-
   wikipedia-solver-database:
     container_name: "wikipedia-solver-database"
     image: "mariadb:10.6.17"
@@ -47,25 +34,5 @@ services:
       - "./adminer/logo.png:/var/www/html/logo.png"
       - "./adminer/fonts/:/var/www/html/fonts"
 
-  # dbgate:
-  #   image: "dbgate/dbgate:5.3.3"
-  #   restart: "always"
-  #   ports:
-  #     - "8080:3000"
-  #   volumes:
-  #     - "dbgate-data:/root/.dbgate"
-  #   environment:
-  #     CONNECTIONS: "con1"
-
-  #     LABEL_con1: "Postgres"
-  #     SERVER_con1: "wikipedia-solver-database"
-  #     USER_con1: ${DATABASE_USER}
-  #     PASSWORD_con1: ${DATABASE_PASSWORD}
-  #     PORT_con1: 5432
-  #     ENGINE_con1: "postgres@dbgate-plugin-postgres"
-
 volumes:
   wikipedia-solver-mariadb-data:
-  # wikipedia-solver-postgres-data:
-  # dbgate-data:
-  #   driver: "local"
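The README section documenting init scripts was only ever written for Postgres and is removed above, but the `mariadb` image honors the same convention: `*.sql` files mounted into `/docker-entrypoint-initdb.d/` run the first time the container starts with an empty data volume. A sketch of the volume mapping for the `wikipedia-solver-database` service, if automatic imports are wanted and not already configured (the `./sql` path is carried over from the removed Postgres service):

```yaml
volumes:
  - "wikipedia-solver-mariadb-data:/var/lib/mysql"
  # Hypothetical: runs ./sql/*.sql on first start (empty volume only).
  - "./sql:/docker-entrypoint-initdb.d/"
```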
diff --git a/data/database-wikipedia-v2.js b/data/database-wikipedia-v2.js
deleted file mode 100644
index bcb63a2..0000000
--- a/data/database-wikipedia-v2.js
+++ /dev/null
@@ -1,113 +0,0 @@
-import fs from "node:fs"
-import path from "node:path"
-import { extractRowsFromSQLValues } from "./utils.js"
-
-const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
-const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
-
-/**
- * @typedef {Record<string, number>} WikipediaPages
- *
- * Object to store pages from Wikipedia:
- * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
- * - Value: page id.
- */
-
-/**
- * Function to clean the `page.sql` file by:
- * - Removing all lines that don't start with `INSERT INTO...`.
- * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
- * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
- * @returns {Promise<WikipediaPages>}
- */
-const cleanPagesSQL = async () => {
-  /** @type {WikipediaPages} */
-  const wikipediaPages = {}
-
-  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
-  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
-  const sqlInputStat = await fs.promises.stat(sqlInputPath)
-  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
-
-  let isInsideInsert = false
-  let current = ""
-  let lastPercent = 0
-
-  return await new Promise((resolve, reject) => {
-    sqlInputFileStream
-      .on("data", (dataInput) => {
-        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
-        const bytesReadPercent = bytesReadRatio * 100
-
-        if (bytesReadPercent - lastPercent >= 1) {
-          console.log(
-            `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
-          )
-          lastPercent = bytesReadPercent
-        }
-
-        let data = current + dataInput
-
-        if (!isInsideInsert) {
-          const lines = data.split("\n").filter((line) => {
-            return line.startsWith(INSERT_INTO_START_INPUT)
-          })
-          const [line] = lines
-          if (line == null) {
-            sqlInputFileStream.close()
-            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
-          }
-          isInsideInsert = true
-          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
-          data = lineStripped
-        }
-
-        const { rows, unCompleted } = extractRowsFromSQLValues(data)
-        current = unCompleted
-
-        for (const row of rows) {
-          if (row.length !== 12) {
-            sqlInputFileStream.close()
-            console.error([row])
-            return reject(new Error(`Invalid Row values.`))
-          }
-
-          const id = Number.parseInt(row[0] ?? "0", 10)
-          const namespace = row[1] ?? ""
-          const title = row[2] ?? ""
-          const isRedirect = row[3] === "1"
-
-          if (namespace === "0" && !isRedirect) {
-            wikipediaPages[title] = id
-          }
-        }
-      })
-      .on("error", (error) => {
-        return reject(error)
-      })
-      .on("close", () => {
-        return resolve(wikipediaPages)
-      })
-  })
-}
-
-const wikipediaPages = await cleanPagesSQL()
-
-const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
-const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
-
-const wikipediaPagesString = Object.entries(wikipediaPages)
-  .map(([title, id]) => {
-    return `(${id},${title})`
-  })
-  .join(",")
-
-await fs.promises.writeFile(
-  sqlOutputPath,
-  `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
-  { encoding: "utf-8" },
-)
-
-// const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
-// await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
-// await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")
diff --git a/data/database-wikipedia.js b/data/database-wikipedia.js
index a41fa5c..c4fddb1 100644
--- a/data/database-wikipedia.js
+++ b/data/database-wikipedia.js
@@ -5,22 +5,29 @@ import { extractRowsFromSQLValues } from "./utils.js"
 const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
 const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
 
+/**
+ * @typedef {Record<string, number>} WikipediaPages
+ *
+ * Object to store pages from Wikipedia:
+ * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
+ * - Value: page id.
+ */
+
 /**
  * Function to clean the `page.sql` file by:
  * - Removing all lines that don't start with `INSERT INTO...`.
  * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
  * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
+ * @returns {Promise<WikipediaPages>}
  */
 const cleanPagesSQL = async () => {
-  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
-  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES\n"
-  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
-  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
+  /** @type {WikipediaPages} */
+  const wikipediaPages = {}
 
+  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
+  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
   const sqlInputStat = await fs.promises.stat(sqlInputPath)
   const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
-  const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
-  await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
 
   let isInsideInsert = false
   let current = ""
@@ -28,7 +35,7 @@ const cleanPagesSQL = async () => {
 
   return await new Promise((resolve, reject) => {
     sqlInputFileStream
-      .on("data", async (dataInput) => {
+      .on("data", (dataInput) => {
         const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
         const bytesReadPercent = bytesReadRatio * 100
 
@@ -39,9 +46,6 @@ const cleanPagesSQL = async () => {
           lastPercent = bytesReadPercent
         }
 
-        /**
-         * @type {string}
-         */
         let data = current + dataInput
 
         if (!isInsideInsert) {
@@ -74,21 +78,36 @@ const cleanPagesSQL = async () => {
           const isRedirect = row[3] === "1"
 
           if (namespace === "0" && !isRedirect) {
-            await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")
+            wikipediaPages[title] = id
           }
         }
       })
-      .on("error", async (error) => {
-        await sqlOutputFile.close()
+      .on("error", (error) => {
        return reject(error)
       })
-      .on("close", async () => {
-        console.log(`Cleaned "${sqlInputPath}" to "${sqlOutputPath}".`)
-        await sqlOutputFile.appendFile(";\n", "utf-8")
-        await sqlOutputFile.close()
-        return resolve()
+      .on("close", () => {
+        return resolve(wikipediaPages)
       })
   })
 }
 
-await cleanPagesSQL()
+const wikipediaPages = await cleanPagesSQL()
+
+const cleanPagesSQLWriteToFile = async () => {
+  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
+  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
+
+  const wikipediaPagesString = Object.entries(wikipediaPages)
+    .map(([title, id]) => {
+      return `(${id},${title})`
+    })
+    .join(",")
+
+  await fs.promises.writeFile(
+    sqlOutputPath,
+    `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
+    { encoding: "utf-8" },
+  )
+}
+
+await cleanPagesSQLWriteToFile()
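A caveat in `cleanPagesSQLWriteToFile` above: `(${id},${title})` writes the title unquoted, so the generated file contains rows like `(12,Anarchism)` instead of `(12,'Anarchism')`, which is not valid SQL for a `VARCHAR` column. A hypothetical fix, assuming `extractRowsFromSQLValues` hands back titles as plain strings (doubling embedded single quotes is the standard SQL escape):

```js
const wikipediaPagesString = Object.entries(wikipediaPages)
  .map(([title, id]) => {
    // Quote the title and escape embedded single quotes (`''` in SQL).
    return `(${id},'${title.replaceAll("'", "''")}')`
  })
  .join(",")
```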
diff --git a/data/database-wikipedia.sh b/data/database-wikipedia.sh
deleted file mode 100755
index d89c02d..0000000
--- a/data/database-wikipedia.sh
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env bash
-
-# Usage: ./database-wikipedia.sh
-# Description: Download and extract Wikipedia database dumps.
-
-set -o errexit
-set -o nounset
-set -o pipefail
-
-DUMP_DIRECTORY="dump"
-SQL_OUTPUT_DIRECTORY="sql"
-DOWNLOAD_DATE="latest"
-WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
-
-mkdir --parents "${DUMP_DIRECTORY}"
-
-download_file() {
-  local filename="${1}"
-  local file_path_output="${DUMP_DIRECTORY}/${filename}"
-  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
-
-  if [[ ! -f "${file_path_output}" ]]; then
-    echo "Downloading \"${filename}\" from \"${file_url}\"..."
-    wget --output-document="${file_path_output}" "${file_url}"
-  else
-    echo "File \"${filename}\" from \"${file_url}\" already exists."
-  fi
-}
-
-# download_file "page.sql.gz"
-# download_file "pagelinks.sql.gz"
-
-extract_file() {
-  local filename="${1}"
-  local file_path_input="${DUMP_DIRECTORY}/${filename}"
-  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
-
-  if [[ ! -f "${file_path_output}" ]]; then
-    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
-    gzip --decompress "${file_path_input}"
-
-    # `--keep` flag to keep the original file, not needed here.
-    # gzip --decompress --keep "${file_path_input}"
-  else
-    echo "File \"${filename}\" already extracted."
-  fi
-}
-
-# extract_file "page.sql.gz"
-# extract_file "pagelinks.sql.gz"
-
-# Function to clean the `page.sql` file by:
-# - Removing all lines that don't start with `INSERT INTO...`.
-# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0.
-# - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
-# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
-# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`.
-# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`.
-# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL.
-clean_pages_sql() {
-  local sql_input_file_directory="${1}"
-  local sql_input="${sql_input_file_directory}/page.sql"
-  local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"
-
-  sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" |
-    grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" |
-    sed 's/),(/)\n(/g' |
-    grep -P "\([0-9]+,0,'.*?',0" |
-    sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" |
-    sed "s/\\\'/''/g" | # Replace escaped single quotes
-    sed 's/\\"/"/g' | # Replace escaped double quotes
-    sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
-    awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' |
-    sed '$ s/,$/;/g' >"$sql_output"
-
-  echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
-}
-
-# clean_pages_sql "${DUMP_DIRECTORY}"
diff --git a/data/download-wikipedia-dump.sh b/data/download-wikipedia-dump.sh
new file mode 100755
index 0000000..1fa1887
--- /dev/null
+++ b/data/download-wikipedia-dump.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+# Usage: ./download-wikipedia-dump.sh
+# Description: Download and extract Wikipedia database dumps.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+DUMP_DIRECTORY="dump"
+DOWNLOAD_DATE="latest"
+WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
+
+mkdir --parents "${DUMP_DIRECTORY}"
+
+download_file() {
+  local filename="${1}"
+  local file_path_output="${DUMP_DIRECTORY}/${filename}"
+  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
+
+  if [[ ! -f "${file_path_output}" ]]; then
+    echo "Downloading \"${filename}\" from \"${file_url}\"..."
+    wget --output-document="${file_path_output}" "${file_url}"
+  else
+    echo "File \"${filename}\" from \"${file_url}\" already exists."
+  fi
+}
+
+download_file "page.sql.gz"
+download_file "pagelinks.sql.gz"
+
+extract_file() {
+  local filename="${1}"
+  local file_path_input="${DUMP_DIRECTORY}/${filename}"
+  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
+
+  if [[ ! -f "${file_path_output}" ]]; then
+    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
+    gzip --decompress "${file_path_input}"
+
+    # `--keep` flag to keep the original file, not needed here.
+    # gzip --decompress --keep "${file_path_input}"
+  else
+    echo "File \"${filename}\" already extracted."
+  fi
+}
+
+extract_file "page.sql.gz"
+extract_file "pagelinks.sql.gz"
diff --git a/data/sql/0-insert-optimizer-start.sql b/data/sql/0-insert-optimizer-start.sql
deleted file mode 100644
index 896385a..0000000
--- a/data/sql/0-insert-optimizer-start.sql
+++ /dev/null
@@ -1,4 +0,0 @@
-SET AUTOCOMMIT = 0;
-SET FOREIGN_KEY_CHECKS = 0;
-SET UNIQUE_CHECKS = 0;
-BEGIN;
diff --git a/data/sql/99-insert-optimizer-end.sql b/data/sql/99-insert-optimizer-end.sql
deleted file mode 100644
index e724937..0000000
--- a/data/sql/99-insert-optimizer-end.sql
+++ /dev/null
@@ -1,4 +0,0 @@
-COMMIT;
-SET AUTOCOMMIT = 1;
-SET FOREIGN_KEY_CHECKS = 1;
-SET UNIQUE_CHECKS = 1;
diff --git a/data/test.js b/data/test.js
deleted file mode 100644
index f9e8715..0000000
--- a/data/test.js
+++ /dev/null
@@ -1,48 +0,0 @@
-import { extractRowsFromSQLValues } from "./utils.js"
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues("(1,'-)',0),(2,'Demographics_of_American_Samoa',0)"),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-d\\'ff)',0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11,'abc',ddf,123,43,'dff'",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-)',0),(2,'C:\\\\',1,0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-)',0),(2,'Good_Singin\\',_Good_Playin\\'',1,0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
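Both the streaming cleaner and the deleted `data/test.js` above depend on `extractRowsFromSQLValues` from `data/utils.js`, which this diff does not show. Its contract: split a possibly-truncated MySQL `VALUES` payload into complete rows, and return the incomplete tail so the caller can prepend it to the next chunk. A minimal sketch of such a parser, inferred from the test cases; the real implementation in `utils.js` may differ:

```js
/**
 * Sketch of the assumed contract of `extractRowsFromSQLValues`.
 * @param {string} data e.g. "(1,'-)',0),(2,'Anarchism',0),(3,'incompl"
 * @returns {{ rows: string[][], unCompleted: string }}
 */
export const extractRowsFromSQLValues = (data) => {
  const rows = []
  let row = []
  let value = ""
  let isInsideString = false
  let rowStartIndex = -1

  for (let index = 0; index < data.length; index += 1) {
    const character = data[index]

    if (isInsideString) {
      if (character === "\\") {
        // MySQL escape sequence: keep the backslash and the next character.
        value += character + (data[index + 1] ?? "")
        index += 1
      } else if (character === "'") {
        isInsideString = false
      } else {
        value += character
      }
    } else if (character === "'") {
      isInsideString = true
    } else if (character === "(") {
      rowStartIndex = index
      row = []
      value = ""
    } else if (character === ")") {
      row.push(value)
      rows.push(row)
      rowStartIndex = -1
      value = ""
    } else if (character === ",") {
      // Inside a row, commas separate values; between rows they are skipped.
      if (rowStartIndex !== -1) {
        row.push(value)
        value = ""
      }
    } else if (rowStartIndex !== -1) {
      value += character
    }
  }

  // The tail after the last complete row is carried over to the next chunk.
  const unCompleted = rowStartIndex === -1 ? "" : data.slice(rowStartIndex)
  return { rows, unCompleted }
}
```

With the first test case above, this yields `rows = [["1", "-)", "0"], ["2", "Demographics_of_American_Samoa", "0"]]` and `unCompleted = ""`.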