wikipedia-game-solver/data/database-wikipedia.sh

#!/usr/bin/env bash
# Usage: ./database-wikipedia.sh
# Description: Download and extract Wikipedia database dumps.
set -o errexit
set -o nounset
set -o pipefail
DUMP_DIRECTORY="dump"
SQL_OUTPUT_DIRECTORY="sql"
DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
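# For example, with DOWNLOAD_DATE="latest", the file "page.sql.gz" is fetched from:
# https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz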
mkdir --parents "${DUMP_DIRECTORY}"
mkdir --parents "${SQL_OUTPUT_DIRECTORY}"
download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" from \"${file_url}\" already exists."
  fi
}
# download_file "page.sql.gz"
# download_file "pagelinks.sql.gz"
extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    gzip --decompress "${file_path_input}"
    # Add the `--keep` flag to preserve the original `.gz` file; not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}
# extract_file "page.sql.gz"
# extract_file "pagelinks.sql.gz"
# Function to clean the `page.sql` file by:
# - Removing all lines that don't start with `INSERT INTO ...`.
# - Keeping only rows where `page_namespace` (2nd column) is 0 and `page_is_redirect` (4th column) is 0.
# - Keeping only the `page_id` (1st column) and `page_title` (3rd column) columns.
# - Replacing 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
# - Replacing the MySQL escaped single quote `\'` with the PostgreSQL version `''`.
# - Replacing the MySQL escaped double quote `\"` with the PostgreSQL version `"`.
# - Replacing escaped backslashes `\\` with a single backslash `\` for PostgreSQL.
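# Illustrative example (hypothetical, abridged row values): a dump statement such as
#   INSERT INTO `page` VALUES (12,0,'Anarchism',0,...),(13,1,'Talk:Anarchism',0,...);
# would produce the following PostgreSQL-compatible output, keeping only the
# namespace-0, non-redirect row:
#   INSERT INTO pages VALUES
#   (12,'Anarchism');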
clean_pages_sql() {
  local sql_input_file_directory="${1}"
  local sql_input="${sql_input_file_directory}/page.sql"
  local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"

  sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" | # Keep only the INSERT statements
    grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" | # Extract each complete statement
    sed -e 's/^INSERT INTO `page` VALUES //' -e 's/;$//' | # Strip the MySQL prefix and trailing semicolon
    sed 's/),(/)\n(/g' | # Split the rows, one per line
    grep -P "\([0-9]+,0,'.*?',0" | # Keep rows where page_namespace = 0 and page_is_redirect = 0
    sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" | # Keep only page_id and page_title
    sed "s/\\\'/''/g" | # Replace escaped single quotes
    sed 's/\\"/"/g' | # Replace escaped double quotes
    sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
    awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' | # Re-wrap rows into one PostgreSQL INSERT
    sed '$ s/,$/;/g' >"${sql_output}" # Terminate the last row with a semicolon

  echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
}
# clean_pages_sql "${DUMP_DIRECTORY}"
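# The generated file can then be loaded into PostgreSQL (assuming a matching
# `pages` table and a hypothetical "wikipedia" database already exist), e.g.:
# psql --dbname="wikipedia" --file="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"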