#!/usr/bin/env bash
# Usage: ./database-wikipedia.sh
# Description: Download and extract Wikipedia database dumps.

# Abort on the first failing command, on use of an unset variable,
# and on any failure inside a pipeline.
set -o errexit
set -o nounset
set -o pipefail

# Where the downloaded/extracted dump files live.
readonly DUMP_DIRECTORY="dump"
# Where the cleaned, PostgreSQL-ready SQL files are written.
readonly SQL_OUTPUT_DIRECTORY="sql"
# Dump snapshot to fetch; "latest" is an alias maintained by Wikimedia.
readonly DOWNLOAD_DATE="latest"
# Base URL; the per-file name (e.g. "page.sql.gz") is appended by download_file.
readonly WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"

mkdir --parents "${DUMP_DIRECTORY}"
#######################################
# Download a single dump file into ${DUMP_DIRECTORY}, skipping the
# download when the file is already present (makes the script resumable).
# Globals:   DUMP_DIRECTORY (read), WIKIPEDIA_DUMP_URL (read)
# Arguments: $1 - dump file name, e.g. "page.sql.gz"
# Outputs:   progress messages to stdout
#######################################
download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" from \"${file_url}\" already exists."
  fi
}
# download_file "page.sql.gz"
# download_file "pagelinks.sql.gz"
#######################################
# Decompress a downloaded .gz dump inside ${DUMP_DIRECTORY} (the .gz file
# is consumed by gzip), skipping when the extracted file already exists.
# Globals:   DUMP_DIRECTORY (read)
# Arguments: $1 - compressed file name, e.g. "page.sql.gz"
# Outputs:   progress messages to stdout
#######################################
extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  # Strip the trailing ".gz" to get the extracted file's path.
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    gzip --decompress "${file_path_input}"

    # `--keep` flag to keep the original file, not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}
# extract_file "page.sql.gz"
# extract_file "pagelinks.sql.gz"
# Function to clean the `page.sql` file by:
# - Removing all lines that don't start with `INSERT INTO...`.
# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0.
# - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`.
# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`.
# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL.
#
# Globals:   SQL_OUTPUT_DIRECTORY (read)
# Arguments: $1 - directory containing the extracted `page.sql`
# Outputs:   writes the cleaned SQL to ${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql
#
# NOTE(review): titles containing a MySQL-escaped quote (`\'`) are dropped by
# the `[^']*` filter below before the quote-rewriting steps run — confirm
# whether such rows are needed.
clean_pages_sql() {
  local sql_input_file_directory="${1}"
  local sql_input="${sql_input_file_directory}/page.sql"
  local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"

  sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" |
    grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" |
    # Strip the statement prefix and the trailing `;` so that EVERY tuple
    # (including the first and last one of each statement) matches the
    # `^\(...\)$`-anchored filters below. Without this, the first tuple kept
    # the "INSERT INTO ..." prefix and the last one kept the ";", and both
    # passed through untransformed.
    sed -e 's/^INSERT INTO `page` VALUES //' -e 's/;$//' |
    # One tuple per line.
    sed 's/),(/)\n(/g' |
    # Keep rows with page_namespace = 0 and page_is_redirect = 0 only.
    grep -P "\([0-9]+,0,'.*?',0" |
    # Keep only page_id and page_title.
    sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" |
    sed "s/\\\'/''/g" | # Replace escaped single quotes
    sed 's/\\"/"/g' | # Replace escaped double quotes
    sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
    awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' |
    sed '$ s/,$/;/g' >"$sql_output"

  echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
}
# clean_pages_sql "${DUMP_DIRECTORY}"