#!/usr/bin/env bash
# Usage: ./database-wikipedia.sh
# Description: Download and extract Wikipedia database dumps.

set -o errexit
set -o nounset
set -o pipefail

DUMP_DIRECTORY="dump"
SQL_OUTPUT_DIRECTORY="sql"
# "latest" works, or pin a specific dump date such as "20240601".
DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"

mkdir --parents "${DUMP_DIRECTORY}"

# Download one dump file into DUMP_DIRECTORY, skipping if already present.
# Arguments:
#   $1 - dump filename, e.g. "page.sql.gz"
# Bug fix: the corrupted "$(unknown)" placeholders (command substitution of a
# nonexistent command) are restored to "${filename}" so paths/URLs are built
# from the actual argument instead of an empty string.
download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" from \"${file_url}\" already exists."
  fi
}

# download_file "page.sql.gz"
# download_file "pagelinks.sql.gz"

# Decompress a .gz file inside DUMP_DIRECTORY, skipping if already extracted.
# Note: gzip removes the original .gz file on success.
# Arguments:
#   $1 - gzipped filename, e.g. "page.sql.gz"
extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    gzip --decompress "${file_path_input}"
    # `--keep` flag to keep the original file, not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}

# extract_file "page.sql.gz"
# extract_file "pagelinks.sql.gz"

# Function to clean the `page.sql` file by:
# - Removing all lines that don't start with `INSERT INTO...`.
# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0.
# - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`.
# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`.
# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL.
# Convert the MySQL `page.sql` dump into a PostgreSQL-compatible insert file.
# Keeps only article rows (page_namespace = 0, page_is_redirect = 0) and only
# the page_id / page_title columns; output goes to SQL_OUTPUT_DIRECTORY.
# Arguments:
#   $1 - directory containing `page.sql` (normally "${DUMP_DIRECTORY}")
clean_pages_sql() {
  local sql_input_file_directory="${1}"
  local sql_input="${sql_input_file_directory}/page.sql"
  local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"

  # Bug fix: the script never created the output directory; under
  # `set -o errexit` the redirection below would abort the script.
  mkdir --parents "${SQL_OUTPUT_DIRECTORY}"

  sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" |
    grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" |
    # Bug fix: strip the statement prefix, otherwise the FIRST tuple of every
    # statement keeps "INSERT INTO `page` VALUES " and fails the anchored
    # extraction regex below, leaking a malformed row into the output.
    sed 's/^INSERT INTO `page` VALUES //' |
    sed 's/),(/)\n(/g' | # One tuple per line
    # Bug fix: strip the trailing `;` so the LAST tuple also matches the
    # anchored `\)$` in the extraction regex below.
    sed 's/;$//' |
    grep -P "\([0-9]+,0,'.*?',0" | # Keep namespace 0, non-redirect rows
    sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" | # Keep id + title
    sed "s/\\\'/''/g" | # Replace escaped single quotes
    sed 's/\\"/"/g' | # Replace escaped double quotes
    sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
    awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' |
    sed '$ s/,$/;/g' >"$sql_output"

  echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
}

# clean_pages_sql "${DUMP_DIRECTORY}"