chore: clean up POC to get Wikipedia dump

Théo LUDWIG 2024-08-05 00:52:48 +02:00
parent 3de838dded
commit 61914d2392
Signed by: theoludwig
GPG Key ID: ADFE5A563D718F3B
10 changed files with 97 additions and 345 deletions

View File

@ -12,11 +12,12 @@
- [x] `page.sql` (`pages` table)
- [ ] `pagelinks.sql` (`internal_links` table)
- [x] Import SQL files
- [ ] Verify the file content up to just before the inserts, to check that it matches the last version, and diff against the last version
- [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for the 'Linux' page`
- [ ] Move the POC (Proof of Concept) from the `data` folder to the `apps/cli` folder
- [ ] Set up `.gitignore` correctly + documentation on how to use + last execution date
- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/))
- [ ] Documentation on how to use + last execution date
- [ ] Rewrite the bash script that downloads and extracts SQL files from the Wikipedia Database Dump in Node.js, for better cross-platform support, easier maintenance, and automation; preferably a single Node.js script that generates everything needed to create the database
- [ ] Verify the file content up to just before the inserts, to check that it matches the last version, and diff against the last version
- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) to get shortest paths between 2 pages (see the BFS sketch after this list)
- [ ] Implement Wikipedia Game Solver (`website`) with inputs, a submit button, and a list of all pages to go from one to the other, or none if no path exists
- [ ] Check how to deal with redirects (+ how they are represented in the Wikipedia Database Dump)
- [ ] Implement toast notifications for errors, warnings, and success messages
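
As a rough illustration of the planned shortest-path feature (see the REST API item above), here is a minimal breadth-first search sketch in JavaScript. It assumes the `internal_links` rows have already been loaded into an in-memory adjacency `Map` (source page id -> ids of the pages it links to); `findShortestPath` and that representation are illustrative, not existing code in this repository.

```js
/**
 * Minimal sketch (not part of this commit): breadth-first search to find one
 * shortest path between two pages, given an adjacency map built from the
 * `internal_links` table (source page id -> ids of the pages it links to).
 * @param {Map<number, number[]>} internalLinks
 * @param {number} sourceId
 * @param {number} targetId
 * @returns {number[] | null} Page ids from source to target, or `null` if unreachable.
 */
const findShortestPath = (internalLinks, sourceId, targetId) => {
  if (sourceId === targetId) {
    return [sourceId]
  }
  /** @type {Map<number, number>} Visited page id -> page id we came from. */
  const previous = new Map([[sourceId, sourceId]])
  let queue = [sourceId]
  while (queue.length > 0) {
    /** @type {number[]} */
    const nextQueue = []
    for (const pageId of queue) {
      for (const linkedId of internalLinks.get(pageId) ?? []) {
        if (previous.has(linkedId)) {
          continue
        }
        previous.set(linkedId, pageId)
        if (linkedId === targetId) {
          // Walk back from the target to the source to rebuild the path.
          const path = [targetId]
          let currentId = pageId
          while (currentId !== sourceId) {
            path.push(currentId)
            currentId = previous.get(currentId)
          }
          path.push(sourceId)
          return path.reverse()
        }
        nextQueue.push(linkedId)
      }
    }
    queue = nextQueue
  }
  return null
}
```

With such a map, `findShortestPath(internalLinks, sourceId, targetId)` would return the ids of the pages along one shortest path, which the planned `api`/`website` could then translate back to titles via the `pages` table.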

View File

@ -4,27 +4,19 @@
Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
To inspect the volume size used by the database: `docker system df -v | grep 'wikipedia-solver-postgres-data'`
To inspect the volume size used by the database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
## PostgreSQL related
### Import SQL file to PostgreSQL Docker Container
In `compose.yaml`, we can specify SQL scripts to be executed when the container starts for the first time.
```yaml
volumes:
  - "./sql:/docker-entrypoint-initdb.d/"
```
### Remove a volume
## Remove a volume
```sh
# List all volumes
docker volume ls
# Remove a volume
docker volume rm data_wikipedia-solver-postgres-data
docker volume rm data_wikipedia-solver-mariadb-data
# Or by using docker compose down
docker-compose down --volumes
```
## MySQL Related
@ -96,32 +88,4 @@ CREATE TABLE `page` (
--
-- Dumping data for table `page`
--
/*!40000 ALTER TABLE `page` DISABLE KEYS */;
INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL);
INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL),(12,0,'Anarchism',0,0,0.786172332974311,'20240731234111','20240731234202',1234495258,110759,'wikitext',NULL),(12281,0,'Gottfried_Wilhelm_Leibniz',0,0,0.79151204115852,'20240731234133','20240731234228',1237687724,155319,'wikitext',NULL),(13,0,'AfghanistanHistory',1,0,0.154661929211,'20240729123940','20240722220436',783865149,90,'wikitext',NULL),(14,0,'AfghanistanGeography',1,0,0.952234464653055,'20240722211426','20240722220436',783865160,92,'wikitext',NULL),(15,0,'AfghanistanPeople',1,0,0.047716566551,'20240722211426','20240722220436',783865293,95,'wikitext',NULL),(12473,1,'Gnosticism',0,0,0.00653186720472934,'20240801075011','20240731232236',1233717868,6579,'wikitext',NULL);
-- Expected output: INSERT INTO `page` VALUES (12,'Anarchism'),(12281,'Gottfried_Wilhelm_Leibniz');
```
### PostgreSQL short version
```sql
CREATE TABLE IF NOT EXISTS pages (
id BIGSERIAL PRIMARY KEY,
title VARCHAR(255) UNIQUE NOT NULL
-- is_redirect BOOLEAN NOT NULL DEFAULT FALSE
);
-- Examples of inserts
INSERT INTO pages VALUES (10, 'AccessibleComputing'); -- (is_redirect = true)
INSERT INTO pages VALUES (10474, 'Eight_queens_puzzle'); -- (is_redirect = false)
INSERT INTO pages VALUES
(10,'AccessibleComputing'),
(12,'Anarchism'),
(13,'AfghanistanHistory'),
(14,'AfghanistanGeography'),
(15,'AfghanistanPeople');
```

View File

@ -1,17 +1,4 @@
services:
  # wikipedia-solver-database:
  #   container_name: "wikipedia-solver-database"
  #   image: "postgres:16.3"
  #   restart: "unless-stopped"
  #   env_file: ".env"
  #   environment:
  #     POSTGRES_USER: ${DATABASE_USER}
  #     POSTGRES_PASSWORD: ${DATABASE_PASSWORD}
  #     POSTGRES_DB: ${DATABASE_NAME}
  #   volumes:
  #     - "wikipedia-solver-postgres-data:/var/lib/postgresql/data"
  #     - "./sql:/docker-entrypoint-initdb.d/"
  wikipedia-solver-database:
    container_name: "wikipedia-solver-database"
    image: "mariadb:10.6.17"
@ -47,25 +34,5 @@ services:
- "./adminer/logo.png:/var/www/html/logo.png"
- "./adminer/fonts/:/var/www/html/fonts"
# dbgate:
# image: "dbgate/dbgate:5.3.3"
# restart: "always"
# ports:
# - "8080:3000"
# volumes:
# - "dbgate-data:/root/.dbgate"
# environment:
# CONNECTIONS: "con1"
# LABEL_con1: "Postgres"
# SERVER_con1: "wikipedia-solver-database"
# USER_con1: ${DATABASE_USER}
# PASSWORD_con1: ${DATABASE_PASSWORD}
# PORT_con1: 5432
# ENGINE_con1: "postgres@dbgate-plugin-postgres"
volumes:
wikipedia-solver-mariadb-data:
# wikipedia-solver-postgres-data:
# dbgate-data:
# driver: "local"

View File

@ -1,113 +0,0 @@
import fs from "node:fs"
import path from "node:path"
import { extractRowsFromSQLValues } from "./utils.js"
const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
/**
* @typedef {Record<string, number>} WikipediaPages
*
* Object to store pages from Wikipedia:
* - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
* - Value: page id.
*/
/**
* Function to clean the `page.sql` file by:
* - Removing all lines that don't start with `INSERT INTO...`.
* - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
* - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
* @returns {Promise<WikipediaPages>}
*/
const cleanPagesSQL = async () => {
/** @type {WikipediaPages} */
const wikipediaPages = {}
const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
const sqlInputStat = await fs.promises.stat(sqlInputPath)
const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
let isInsideInsert = false
let current = ""
let lastPercent = 0
return await new Promise((resolve, reject) => {
sqlInputFileStream
.on("data", (dataInput) => {
const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
const bytesReadPercent = bytesReadRatio * 100
if (bytesReadPercent - lastPercent >= 1) {
console.log(
`Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
)
lastPercent = bytesReadPercent
}
let data = current + dataInput
if (!isInsideInsert) {
const lines = data.split("\n").filter((line) => {
return line.startsWith(INSERT_INTO_START_INPUT)
})
const [line] = lines
if (line == null) {
sqlInputFileStream.close()
return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
}
isInsideInsert = true
const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
data = lineStripped
}
const { rows, unCompleted } = extractRowsFromSQLValues(data)
current = unCompleted
for (const row of rows) {
if (row.length !== 12) {
sqlInputFileStream.close()
console.error([row])
return reject(new Error(`Invalid Row values.`))
}
const id = Number.parseInt(row[0] ?? "0", 10)
const namespace = row[1] ?? ""
const title = row[2] ?? ""
const isRedirect = row[3] === "1"
if (namespace === "0" && !isRedirect) {
wikipediaPages[title] = id
}
}
})
.on("error", (error) => {
return reject(error)
})
.on("close", () => {
return resolve(wikipediaPages)
})
})
}
const wikipediaPages = await cleanPagesSQL()
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
const wikipediaPagesString = Object.entries(wikipediaPages)
.map(([title, id]) => {
return `(${id},${title})`
})
.join(",")
await fs.promises.writeFile(
sqlOutputPath,
`${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
{ encoding: "utf-8" },
)
// const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
// await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
// await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")

View File

@ -5,22 +5,29 @@ import { extractRowsFromSQLValues } from "./utils.js"
const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
/**
* @typedef {Record<string, number>} WikipediaPages
*
* Object to store pages from Wikipedia:
* - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
* - Value: page id.
*/
/**
* Function to clean the `page.sql` file by:
* - Removing all lines that don't start with `INSERT INTO...`.
* - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
* - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
* @returns {Promise<WikipediaPages>}
*/
const cleanPagesSQL = async () => {
const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES\n"
const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
/** @type {WikipediaPages} */
const wikipediaPages = {}
const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
const sqlInputStat = await fs.promises.stat(sqlInputPath)
const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
let isInsideInsert = false
let current = ""
@ -28,7 +35,7 @@ const cleanPagesSQL = async () => {
return await new Promise((resolve, reject) => {
sqlInputFileStream
.on("data", async (dataInput) => {
.on("data", (dataInput) => {
const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
const bytesReadPercent = bytesReadRatio * 100
@ -39,9 +46,6 @@ const cleanPagesSQL = async () => {
lastPercent = bytesReadPercent
}
/**
* @type {string}
*/
let data = current + dataInput
if (!isInsideInsert) {
@ -74,21 +78,36 @@ const cleanPagesSQL = async () => {
const isRedirect = row[3] === "1"
if (namespace === "0" && !isRedirect) {
await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")
wikipediaPages[title] = id
}
}
})
.on("error", async (error) => {
await sqlOutputFile.close()
.on("error", (error) => {
return reject(error)
})
.on("close", async () => {
console.log(`Cleaned "${sqlInputPath}" to "${sqlOutputPath}".`)
await sqlOutputFile.appendFile(";\n", "utf-8")
await sqlOutputFile.close()
return resolve()
.on("close", () => {
return resolve(wikipediaPages)
})
})
}
await cleanPagesSQL()
const wikipediaPages = await cleanPagesSQL()
const cleanPagesSQLWriteToFile = async () => {
const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
const wikipediaPagesString = Object.entries(wikipediaPages)
.map(([title, id]) => {
return `(${id},${title})`
})
.join(",")
await fs.promises.writeFile(
sqlOutputPath,
`${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
{ encoding: "utf-8" },
)
}
await cleanPagesSQLWriteToFile()

View File

@ -1,79 +0,0 @@
#!/usr/bin/env bash
# Usage: ./database-wikipedia.sh
# Description: Download and extract Wikipedia database dumps.
set -o errexit
set -o nounset
set -o pipefail
DUMP_DIRECTORY="dump"
SQL_OUTPUT_DIRECTORY="sql"
DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
mkdir --parents "${DUMP_DIRECTORY}"
download_file() {
local filename="${1}"
local file_path_output="${DUMP_DIRECTORY}/${filename}"
local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
if [[ ! -f "${file_path_output}" ]]; then
echo "Downloading \"${filename}\" from \"${file_url}\"..."
wget --output-document="${file_path_output}" "${file_url}"
else
echo "File \"${filename}\" from \"${file_url}\" already exists."
fi
}
# download_file "page.sql.gz"
# download_file "pagelinks.sql.gz"
extract_file() {
local filename="${1}"
local file_path_input="${DUMP_DIRECTORY}/${filename}"
local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
if [[ ! -f "${file_path_output}" ]]; then
echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
gzip --decompress "${file_path_input}"
# `--keep` flag to keep the original file, not needed here.
# gzip --decompress --keep "${file_path_input}"
else
echo "File \"${filename}\" already extracted."
fi
}
# extract_file "page.sql.gz"
# extract_file "pagelinks.sql.gz"
# Function to clean the `page.sql` file by:
# - Removing all lines that don't start with `INSERT INTO...`.
# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0.
# - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`.
# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`.
# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL.
clean_pages_sql() {
local sql_input_file_directory="${1}"
local sql_input="${sql_input_file_directory}/page.sql"
local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"
sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" |
grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" |
sed 's/),(/)\n(/g' |
grep -P "\([0-9]+,0,'.*?',0" |
sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" |
sed "s/\\\'/''/g" | # Replace escaped single quotes
sed 's/\\"/"/g' | # Replace escaped double quotes
sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' |
sed '$ s/,$/;/g' >"$sql_output"
echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
}
# clean_pages_sql "${DUMP_DIRECTORY}"

data/download-wikipedia-dump.sh Executable file (49 lines added)
View File

@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Usage: ./download-wikipedia-dump.sh
# Description: Download and extract Wikipedia database dumps.
set -o errexit
set -o nounset
set -o pipefail
DUMP_DIRECTORY="dump"
DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
mkdir --parents "${DUMP_DIRECTORY}"
download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" from \"${file_url}\" already exists."
  fi
}

download_file "page.sql.gz"
download_file "pagelinks.sql.gz"

extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    gzip --decompress "${file_path_input}"
    # `--keep` flag to keep the original file, not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}
extract_file "page.sql.gz"
extract_file "pagelinks.sql.gz"

View File

@ -1,4 +0,0 @@
SET AUTOCOMMIT = 0;
SET FOREIGN_KEY_CHECKS = 0;
SET UNIQUE_CHECKS = 0;
BEGIN;

View File

@ -1,4 +0,0 @@
COMMIT;
SET AUTOCOMMIT = 1;
SET FOREIGN_KEY_CHECKS = 1;
SET UNIQUE_CHECKS = 1;

View File

@ -1,48 +0,0 @@
import { extractRowsFromSQLValues } from "./utils.js"
console.log(
"output:",
extractRowsFromSQLValues("(1,'-)',0),(2,'Demographics_of_American_Samoa',0)"),
)
console.log(
"output:",
extractRowsFromSQLValues(
`(1,'-d\\'ff)',0),(2,'Demographics_of_American_Samoa',0)`,
),
)
console.log(
"output:",
extractRowsFromSQLValues(
"(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11,'abc',ddf,123,43,'dff'",
),
)
console.log(
"output:",
extractRowsFromSQLValues(
"(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11",
),
)
console.log(
"output:",
extractRowsFromSQLValues(
"(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(",
),
)
console.log(
"output:",
extractRowsFromSQLValues(
`(1,'-)',0),(2,'C\\\\',1,0),(2,'Demographics_of_American_Samoa',0)`,
),
)
console.log(
"output:",
extractRowsFromSQLValues(
`(1,'-)',0),(2,'Good_Singin\\',_Good_Playin\\'',1,0),(2,'Demographics_of_American_Samoa',0)`,
),
)
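
These (now removed) smoke tests exercise `extractRowsFromSQLValues` from `utils.js`, which is not part of this diff. For reference, here is a minimal sketch of such a parser, assuming it returns `{ rows, unCompleted }`, where `rows` holds the complete `(...)` tuples split into string values (MySQL escape sequences such as `\'` and `\\` kept as-is) and `unCompleted` is the trailing partial tuple to prepend to the next stream chunk. This is an assumption-based sketch, not the repository's actual implementation.

```js
/**
 * Minimal sketch (not the repository's actual `utils.js` implementation).
 * Splits a chunk of a MySQL `INSERT INTO ... VALUES (...),(...)` payload into
 * complete rows of string values, keeping MySQL escape sequences (`\'`, `\\`)
 * as-is, and returns the trailing incomplete part so the caller can prepend it
 * to the next chunk of the stream.
 * @param {string} data
 * @returns {{ rows: string[][], unCompleted: string }}
 */
export const extractRowsFromSQLValues = (data) => {
  /** @type {string[][]} */
  const rows = []
  /** @type {string[]} */
  let row = []
  let value = ""
  let isInsideRow = false
  let isInsideString = false
  let lastRowEndIndex = 0

  for (let index = 0; index < data.length; index += 1) {
    const character = data[index]

    if (isInsideString) {
      if (character === "\\") {
        // Keep the escape sequence (e.g. `\'` or `\\`) and skip the escaped character.
        value += character + (data[index + 1] ?? "")
        index += 1
      } else if (character === "'") {
        isInsideString = false
      } else {
        value += character
      }
      continue
    }

    if (!isInsideRow) {
      // Outside a row: wait for the next `(`, ignoring the `,` between rows.
      if (character === "(") {
        isInsideRow = true
        row = []
        value = ""
      }
      continue
    }

    if (character === "'") {
      isInsideString = true
    } else if (character === ",") {
      row.push(value)
      value = ""
    } else if (character === ")") {
      row.push(value)
      rows.push(row)
      isInsideRow = false
      lastRowEndIndex = index + 1
    } else {
      value += character
    }
  }

  return { rows, unCompleted: data.slice(lastRowEndIndex) }
}
```

With the first test input above, this sketch returns `{ rows: [["1", "-)", "0"], ["2", "Demographics_of_American_Samoa", "0"]], unCompleted: "" }`; incomplete trailing tuples end up in `unCompleted`, matching how the cleaning script carries `current` between stream chunks.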