chore: clean up POC to get Wikipedia dump
Parent: 3de838dded
Commit: 61914d2392
TODO.md

@@ -12,11 +12,12 @@
 - [x] `page.sql` (`pages` tables)
 - [ ] `pagelinks.sql` (`internal_links` tables)
 - [x] Import SQL files
-- [ ] Verify file content up to before inserts, to check if it matches last version, and diff with last version
 - [ ] Try `SELECT count(*) FROM internal_links il WHERE il.source_id = (SELECT p.id FROM pages p WHERE p.title = 'Linux'); -- Count of internal links for 'Linux' page`
 - [ ] Move from POC (Proof of concept) in `data` folder to `apps/cli` folder
-- [ ] `.gitignore` correctly + Documentation how to use + Last execution date
-- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/))
+- [ ] Documentation how to use + Last execution date
+- [ ] Rewrite bash script to download and extract SQL files from Wikipedia Database Dump to Node.js for better cross-platform support and easier maintenance + automation, preferably one Node.js script to generate everything to create the database
+- [ ] Verify file content up to before inserts, to check if it matches last version, and diff with last version
+- [ ] Implement REST API (`api`) with JSON responses ([AdonisJS](https://adonisjs.com/)) to get shortest paths between 2 pages
 - [ ] Implement Wikipedia Game Solver (`website`) with inputs, button to submit, and list all pages to go from one to another, or none if it is not possible
 - [ ] Check how to deal with redirects (+ Wikipedia Database Dump related)
 - [ ] Implement toast notifications for errors, warnings, and success messages
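One of the new TODO items above plans to rewrite the download/extract bash script in Node.js. As a rough illustration only (not part of this commit), a single Node.js script could stream each dump through gunzip to disk; this sketch assumes Node.js 18+ for the built-in `fetch` and reuses the dump URL and file names from `data/download-wikipedia-dump.sh` added later in this commit:

```js
// Hypothetical sketch only, not part of this commit: a Node.js replacement for
// the download + extract steps of data/download-wikipedia-dump.sh.
import fs from "node:fs"
import zlib from "node:zlib"
import { Readable } from "node:stream"
import { pipeline } from "node:stream/promises"

const DUMP_DIRECTORY = "dump"
const WIKIPEDIA_DUMP_URL = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-"

const downloadAndExtractFile = async (filename) => {
  const response = await fetch(`${WIKIPEDIA_DUMP_URL}${filename}`)
  if (!response.ok || response.body == null) {
    throw new Error(`Failed to download "${filename}" (HTTP ${response.status}).`)
  }

  // Stream the gzipped dump through gunzip straight to disk,
  // so the multi-gigabyte file is never held in memory.
  const outputPath = `${DUMP_DIRECTORY}/${filename.replace(/\.gz$/, "")}`
  await fs.promises.mkdir(DUMP_DIRECTORY, { recursive: true })
  await pipeline(
    Readable.fromWeb(response.body),
    zlib.createGunzip(),
    fs.createWriteStream(outputPath),
  )
}

await downloadAndExtractFile("page.sql.gz")
await downloadAndExtractFile("pagelinks.sql.gz")
```

Streaming the HTTP response through `zlib.createGunzip()` keeps memory usage flat, which mirrors what the `wget` + `gzip --decompress` pair does in the shell version.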
@@ -4,27 +4,19 @@
 
 Show the first 10 lines of the SQL file: `head -n 10 ./dump/page.sql`
 
-To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-postgres-data'`
+To inspect volume size used by database: `docker system df -v | grep 'wikipedia-solver-mariadb-data'`
 
-## PostgreSQL related
+## Remove a volume
 
-### Import SQL file to PostgreSQL Docker Container
-
-In `compose.yaml`, we can specify SQL scripts to be executed when the container starts for the first time.
-
-```yaml
-volumes:
-  - "./sql:/docker-entrypoint-initdb.d/"
-```
-
-### Remove a volume
-
 ```sh
 # List all volumes
 docker volume ls
 
 # Remove a volume
-docker volume rm data_wikipedia-solver-postgres-data
+docker volume rm data_wikipedia-solver-mariadb-data
 
+# Or by using docker compose down
+docker-compose down --volumes
 ```
 
 ## MySQL Related
@@ -96,32 +88,4 @@ CREATE TABLE `page` (
 --
 -- Dumping data for table `page`
 --
-
-/*!40000 ALTER TABLE `page` DISABLE KEYS */;
-INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL);
-
-INSERT INTO `page` VALUES (10,0,'AccessibleComputing',1,0,0.856935107283,'20240722211426','20240722220435',1219062925,111,'wikitext',NULL),(12,0,'Anarchism',0,0,0.786172332974311,'20240731234111','20240731234202',1234495258,110759,'wikitext',NULL),(12281,0,'Gottfried_Wilhelm_Leibniz',0,0,0.79151204115852,'20240731234133','20240731234228',1237687724,155319,'wikitext',NULL),(13,0,'AfghanistanHistory',1,0,0.154661929211,'20240729123940','20240722220436',783865149,90,'wikitext',NULL),(14,0,'AfghanistanGeography',1,0,0.952234464653055,'20240722211426','20240722220436',783865160,92,'wikitext',NULL),(15,0,'AfghanistanPeople',1,0,0.047716566551,'20240722211426','20240722220436',783865293,95,'wikitext',NULL),(12473,1,'Gnosticism',0,0,0.00653186720472934,'20240801075011','20240731232236',1233717868,6579,'wikitext',NULL);
--- Expected output: INSERT INTO `page` VALUES (12,'Anarchism'),(12281,'Gottfried_Wilhelm_Leibniz');
-```
-
-### PostgreSQL short version
-
-```sql
-CREATE TABLE IF NOT EXISTS pages (
-  id BIGSERIAL PRIMARY KEY,
-  title VARCHAR(255) UNIQUE NOT NULL
-
-  -- is_redirect BOOLEAN NOT NULL DEFAULT FALSE
-);
-
--- Examples of inserts
-INSERT INTO pages VALUES (10, 'AccessibleComputing'); -- (is_redirect = true)
-INSERT INTO pages VALUES (10474, 'Eight_queens_puzzle'); -- (is_redirect = false)
-
-INSERT INTO pages VALUES
-(10,'AccessibleComputing'),
-(12,'Anarchism'),
-(13,'AfghanistanHistory'),
-(14,'AfghanistanGeography'),
-(15,'AfghanistanPeople');
 ```
@@ -1,17 +1,4 @@
 services:
-  # wikipedia-solver-database:
-  #   container_name: "wikipedia-solver-database"
-  #   image: "postgres:16.3"
-  #   restart: "unless-stopped"
-  #   env_file: ".env"
-  #   environment:
-  #     POSTGRES_USER: ${DATABASE_USER}
-  #     POSTGRES_PASSWORD: ${DATABASE_PASSWORD}
-  #     POSTGRES_DB: ${DATABASE_NAME}
-  #   volumes:
-  #     - "wikipedia-solver-postgres-data:/var/lib/postgresql/data"
-  #     - "./sql:/docker-entrypoint-initdb.d/"
-
   wikipedia-solver-database:
     container_name: "wikipedia-solver-database"
     image: "mariadb:10.6.17"
@@ -47,25 +34,5 @@ services:
       - "./adminer/logo.png:/var/www/html/logo.png"
       - "./adminer/fonts/:/var/www/html/fonts"
-
-  # dbgate:
-  #   image: "dbgate/dbgate:5.3.3"
-  #   restart: "always"
-  #   ports:
-  #     - "8080:3000"
-  #   volumes:
-  #     - "dbgate-data:/root/.dbgate"
-  #   environment:
-  #     CONNECTIONS: "con1"
-
-  #     LABEL_con1: "Postgres"
-  #     SERVER_con1: "wikipedia-solver-database"
-  #     USER_con1: ${DATABASE_USER}
-  #     PASSWORD_con1: ${DATABASE_PASSWORD}
-  #     PORT_con1: 5432
-  #     ENGINE_con1: "postgres@dbgate-plugin-postgres"
 
 volumes:
   wikipedia-solver-mariadb-data:
-  # wikipedia-solver-postgres-data:
-  # dbgate-data:
-  #   driver: "local"
@@ -1,113 +0,0 @@
-import fs from "node:fs"
-import path from "node:path"
-import { extractRowsFromSQLValues } from "./utils.js"
-
-const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
-const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
-
-/**
- * @typedef {Record<string, number>} WikipediaPages
- *
- * Object to store pages from Wikipedia:
- * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
- * - Value: page id.
- */
-
-/**
- * Function to clean the `page.sql` file by:
- * - Removing all lines that don't start with `INSERT INTO...`.
- * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
- * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
- * @returns {Promise<WikipediaPages>}
- */
-const cleanPagesSQL = async () => {
-  /** @type {WikipediaPages} */
-  const wikipediaPages = {}
-
-  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
-  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
-  const sqlInputStat = await fs.promises.stat(sqlInputPath)
-  const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
-
-  let isInsideInsert = false
-  let current = ""
-  let lastPercent = 0
-
-  return await new Promise((resolve, reject) => {
-    sqlInputFileStream
-      .on("data", (dataInput) => {
-        const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
-        const bytesReadPercent = bytesReadRatio * 100
-
-        if (bytesReadPercent - lastPercent >= 1) {
-          console.log(
-            `Bytes read (${bytesReadPercent.toFixed(2)}%): ${sqlInputFileStream.bytesRead} / ${sqlInputStat.size}`,
-          )
-          lastPercent = bytesReadPercent
-        }
-
-        let data = current + dataInput
-
-        if (!isInsideInsert) {
-          const lines = data.split("\n").filter((line) => {
-            return line.startsWith(INSERT_INTO_START_INPUT)
-          })
-          const [line] = lines
-          if (line == null) {
-            sqlInputFileStream.close()
-            return reject(new Error(`No "${INSERT_INTO_START_INPUT}" found.`))
-          }
-          isInsideInsert = true
-          const lineStripped = line.slice(INSERT_INTO_START_INPUT.length)
-          data = lineStripped
-        }
-
-        const { rows, unCompleted } = extractRowsFromSQLValues(data)
-        current = unCompleted
-
-        for (const row of rows) {
-          if (row.length !== 12) {
-            sqlInputFileStream.close()
-            console.error([row])
-            return reject(new Error(`Invalid Row values.`))
-          }
-
-          const id = Number.parseInt(row[0] ?? "0", 10)
-          const namespace = row[1] ?? ""
-          const title = row[2] ?? ""
-          const isRedirect = row[3] === "1"
-
-          if (namespace === "0" && !isRedirect) {
-            wikipediaPages[title] = id
-          }
-        }
-      })
-      .on("error", (error) => {
-        return reject(error)
-      })
-      .on("close", () => {
-        return resolve(wikipediaPages)
-      })
-  })
-}
-
-const wikipediaPages = await cleanPagesSQL()
-
-const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
-const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
-
-const wikipediaPagesString = Object.entries(wikipediaPages)
-  .map(([title, id]) => {
-    return `(${id},${title})`
-  })
-  .join(",")
-
-await fs.promises.writeFile(
-  sqlOutputPath,
-  `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
-  { encoding: "utf-8" },
-)
-
-// const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
-// await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
-// await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")
@@ -5,22 +5,29 @@ import { extractRowsFromSQLValues } from "./utils.js"
 const SQL_DUMP_PATH = path.join(process.cwd(), "dump")
 const SQL_OUTPUT_PATH = path.join(process.cwd(), "sql")
 
+/**
+ * @typedef {Record<string, number>} WikipediaPages
+ *
+ * Object to store pages from Wikipedia:
+ * - Key: page title sanitized - The real title shown is this title with underscores (_) converted to spaces ( ).
+ * - Value: page id.
+ */
+
 /**
  * Function to clean the `page.sql` file by:
  * - Removing all lines that don't start with `INSERT INTO...`.
  * - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to false (0).
  * - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
+ * @returns {Promise<WikipediaPages>}
  */
 const cleanPagesSQL = async () => {
-  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
-  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES\n"
-  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
-  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
+  /** @type {WikipediaPages} */
+  const wikipediaPages = {}
 
+  const INSERT_INTO_START_INPUT = "INSERT INTO `page` VALUES "
+  const sqlInputPath = path.join(SQL_DUMP_PATH, "page.sql")
   const sqlInputStat = await fs.promises.stat(sqlInputPath)
   const sqlInputFileStream = fs.createReadStream(sqlInputPath, "utf-8")
-  const sqlOutputFile = await fs.promises.open(sqlOutputPath, "w")
-  await sqlOutputFile.appendFile(INSERT_INTO_START_OUTPUT)
 
   let isInsideInsert = false
   let current = ""

@@ -28,7 +35,7 @@ const cleanPagesSQL = async () => {
 
   return await new Promise((resolve, reject) => {
     sqlInputFileStream
-      .on("data", async (dataInput) => {
+      .on("data", (dataInput) => {
         const bytesReadRatio = sqlInputFileStream.bytesRead / sqlInputStat.size
         const bytesReadPercent = bytesReadRatio * 100
 

@@ -39,9 +46,6 @@ const cleanPagesSQL = async () => {
           lastPercent = bytesReadPercent
         }
 
-        /**
-         * @type {string}
-         */
         let data = current + dataInput
 
         if (!isInsideInsert) {

@@ -74,21 +78,36 @@ const cleanPagesSQL = async () => {
           const isRedirect = row[3] === "1"
 
           if (namespace === "0" && !isRedirect) {
-            await sqlOutputFile.appendFile(`(${id},${title}),\n`, "utf-8")
+            wikipediaPages[title] = id
           }
         }
       })
-      .on("error", async (error) => {
-        await sqlOutputFile.close()
+      .on("error", (error) => {
         return reject(error)
       })
-      .on("close", async () => {
-        console.log(`Cleaned "${sqlInputPath}" to "${sqlOutputPath}".`)
-        await sqlOutputFile.appendFile(";\n", "utf-8")
-        await sqlOutputFile.close()
-        return resolve()
+      .on("close", () => {
+        return resolve(wikipediaPages)
       })
   })
 }
 
-await cleanPagesSQL()
+const wikipediaPages = await cleanPagesSQL()
+
+const cleanPagesSQLWriteToFile = async () => {
+  const sqlOutputPath = path.join(SQL_OUTPUT_PATH, "2-pages-inserts.sql")
+  const INSERT_INTO_START_OUTPUT = "INSERT INTO pages VALUES "
+
+  const wikipediaPagesString = Object.entries(wikipediaPages)
+    .map(([title, id]) => {
+      return `(${id},${title})`
+    })
+    .join(",")
+
+  await fs.promises.writeFile(
+    sqlOutputPath,
+    `${INSERT_INTO_START_OUTPUT}${wikipediaPagesString};`,
+    { encoding: "utf-8" },
+  )
+}
+
+await cleanPagesSQLWriteToFile()
@@ -1,79 +0,0 @@
-#!/usr/bin/env bash
-
-# Usage: ./database-wikipedia.sh
-# Description: Download and extract Wikipedia database dumps.
-
-set -o errexit
-set -o nounset
-set -o pipefail
-
-DUMP_DIRECTORY="dump"
-SQL_OUTPUT_DIRECTORY="sql"
-DOWNLOAD_DATE="latest"
-WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
-
-mkdir --parents "${DUMP_DIRECTORY}"
-
-download_file() {
-  local filename="${1}"
-  local file_path_output="${DUMP_DIRECTORY}/${filename}"
-  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
-
-  if [[ ! -f "${file_path_output}" ]]; then
-    echo "Downloading \"${filename}\" from \"${file_url}\"..."
-    wget --output-document="${file_path_output}" "${file_url}"
-  else
-    echo "File \"${filename}\" from \"${file_url}\" already exists."
-  fi
-}
-
-# download_file "page.sql.gz"
-# download_file "pagelinks.sql.gz"
-
-extract_file() {
-  local filename="${1}"
-  local file_path_input="${DUMP_DIRECTORY}/${filename}"
-  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
-
-  if [[ ! -f "${file_path_output}" ]]; then
-    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
-    gzip --decompress "${file_path_input}"
-
-    # `--keep` flag to keep the original file, not needed here.
-    # gzip --decompress --keep "${file_path_input}"
-  else
-    echo "File \"${filename}\" already extracted."
-  fi
-}
-
-# extract_file "page.sql.gz"
-# extract_file "pagelinks.sql.gz"
-
-# Function to clean the `page.sql` file by:
-# - Removing all lines that don't start with `INSERT INTO...`.
-# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0.
-# - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
-# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
-# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`.
-# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`.
-# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL.
-clean_pages_sql() {
-  local sql_input_file_directory="${1}"
-  local sql_input="${sql_input_file_directory}/page.sql"
-  local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"
-
-  sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" |
-    grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" |
-    sed 's/),(/)\n(/g' |
-    grep -P "\([0-9]+,0,'.*?',0" |
-    sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" |
-    sed "s/\\\'/''/g" | # Replace escaped single quotes
-    sed 's/\\"/"/g' | # Replace escaped double quotes
-    sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
-    awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' |
-    sed '$ s/,$/;/g' >"$sql_output"
-
-  echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
-}
-
-# clean_pages_sql "${DUMP_DIRECTORY}"
data/download-wikipedia-dump.sh (new executable file)

@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+# Usage: ./download-wikipedia-dump.sh
+# Description: Download and extract Wikipedia database dumps.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+DUMP_DIRECTORY="dump"
+DOWNLOAD_DATE="latest"
+WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
+
+mkdir --parents "${DUMP_DIRECTORY}"
+
+download_file() {
+  local filename="${1}"
+  local file_path_output="${DUMP_DIRECTORY}/${filename}"
+  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
+
+  if [[ ! -f "${file_path_output}" ]]; then
+    echo "Downloading \"${filename}\" from \"${file_url}\"..."
+    wget --output-document="${file_path_output}" "${file_url}"
+  else
+    echo "File \"${filename}\" from \"${file_url}\" already exists."
+  fi
+}
+
+download_file "page.sql.gz"
+download_file "pagelinks.sql.gz"
+
+extract_file() {
+  local filename="${1}"
+  local file_path_input="${DUMP_DIRECTORY}/${filename}"
+  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
+
+  if [[ ! -f "${file_path_output}" ]]; then
+    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
+    gzip --decompress "${file_path_input}"
+
+    # `--keep` flag to keep the original file, not needed here.
+    # gzip --decompress --keep "${file_path_input}"
+  else
+    echo "File \"${filename}\" already extracted."
+  fi
+}
+
+extract_file "page.sql.gz"
+extract_file "pagelinks.sql.gz"
@@ -1,4 +0,0 @@
-SET AUTOCOMMIT = 0;
-SET FOREIGN_KEY_CHECKS = 0;
-SET UNIQUE_CHECKS = 0;
-BEGIN;
@@ -1,4 +0,0 @@
-COMMIT;
-SET AUTOCOMMIT = 1;
-SET FOREIGN_KEY_CHECKS = 1;
-SET UNIQUE_CHECKS = 1;
data/test.js (deleted file)

@@ -1,48 +0,0 @@
-import { extractRowsFromSQLValues } from "./utils.js"
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues("(1,'-)',0),(2,'Demographics_of_American_Samoa',0)"),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-d\\'ff)',0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11,'abc',ddf,123,43,'dff'",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(11",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    "(1,'-)',0),(2,'Demographics_of_American_Samoa',0),(",
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-)',0),(2,'C:\\\\',1,0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
-
-console.log(
-  "output:",
-  extractRowsFromSQLValues(
-    `(1,'-)',0),(2,'Good_Singin\\',_Good_Playin\\'',1,0),(2,'Demographics_of_American_Samoa',0)`,
-  ),
-)
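The deleted `data/test.js` above exercises `extractRowsFromSQLValues` from `./utils.js`, which this commit does not touch or show. For reference, a minimal sketch that is consistent with those test inputs (an assumption about the behaviour, not the repository's actual `utils.js`) could look like:

```js
// Hypothetical sketch of `extractRowsFromSQLValues`, not the repository's
// actual implementation. It walks a chunk of SQL `(...),(...),...` VALUES
// text, tracking quoting and backslash escapes, and returns the complete
// rows plus the uncompleted trailing text.
export const extractRowsFromSQLValues = (input) => {
  const rows = []
  let row = []
  let value = ""
  let insideString = false
  let insideRow = false
  let lastRowEnd = 0

  for (let index = 0; index < input.length; index += 1) {
    const character = input[index]

    if (insideString) {
      if (character === "\\") {
        // Keep the escaped character and skip over the backslash.
        value += input[index + 1] ?? ""
        index += 1
      } else if (character === "'") {
        insideString = false
      } else {
        value += character
      }
    } else if (character === "'") {
      insideString = true
    } else if (character === "(" && !insideRow) {
      insideRow = true
      row = []
      value = ""
    } else if (character === "," && insideRow) {
      row.push(value)
      value = ""
    } else if (character === ")" && insideRow) {
      row.push(value)
      rows.push(row)
      insideRow = false
      lastRowEnd = index + 1
    } else if (insideRow) {
      value += character
    }
    // Separators between rows (commas) are skipped.
  }

  // Anything after the last complete row is returned so the caller can
  // prepend it to the next chunk of the stream.
  const unCompleted = input.slice(lastRowEnd).replace(/^,/, "")
  return { rows, unCompleted }
}
```

The `unCompleted` remainder is what the cleaning script prepends to its next stream chunk (`let data = current + dataInput`), so a row split across two chunks is parsed once the rest of it arrives.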