#!/usr/bin/env bash
# Usage: ./database-wikipedia.sh
# Description: Download and extract Wikipedia database dumps.

# Abort on the first failing command, on use of an unset variable,
# and on any failure inside a pipeline.
set -o errexit
set -o nounset
set -o pipefail

# Where the downloaded/extracted dump files live.
readonly DUMP_DIRECTORY="dump"
# Where the cleaned, PostgreSQL-ready SQL files are written.
readonly SQL_OUTPUT_DIRECTORY="sql"
# Dump snapshot to fetch; "latest" is an alias maintained by Wikimedia.
readonly DOWNLOAD_DATE="latest"
# Base URL; the per-file name (e.g. "page.sql.gz") is appended by download_file.
readonly WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"

mkdir --parents "${DUMP_DIRECTORY}"
#######################################
# Download a single dump file into ${DUMP_DIRECTORY}, skipping the
# download when the file is already present (makes the script resumable).
# Globals:   DUMP_DIRECTORY (read), WIKIPEDIA_DUMP_URL (read)
# Arguments: $1 - dump file name, e.g. "page.sql.gz"
# Outputs:   progress messages to stdout
#######################################
download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" from \"${file_url}\" already exists."
  fi
}
# download_file "page.sql.gz"
# download_file "pagelinks.sql.gz"
#######################################
# Decompress a downloaded .gz dump inside ${DUMP_DIRECTORY} (the .gz file
# is consumed by gzip), skipping when the extracted file already exists.
# Globals:   DUMP_DIRECTORY (read)
# Arguments: $1 - compressed file name, e.g. "page.sql.gz"
# Outputs:   progress messages to stdout
#######################################
extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  # Strip the trailing ".gz" to get the extracted file's path.
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    gzip --decompress "${file_path_input}"

    # `--keep` flag to keep the original file, not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}
# extract_file "page.sql.gz"
# extract_file "pagelinks.sql.gz"
# Function to clean the `page.sql` file by:
# - Removing all lines that don't start with `INSERT INTO...`.
# - Filter by keeping rows where `page_namespace` (2nd column) is equal to 0, and where `page_is_redirect` (4th column) is equal to 0.
# - Only keep columns `page_id` (1st column) and `page_title` (3rd column).
# - Replace 'INSERT INTO `page` VALUES' with 'INSERT INTO pages VALUES'.
# - Replace escape single quote `\'` in MySQL to the PostgreSQL version `''`.
# - Replace escape double quote `\"` in MySQL to the PostgreSQL version `"`.
# - Handle backslashes `\\` by replacing them with a single backslash `\` for PostgreSQL.
#
# Globals:   SQL_OUTPUT_DIRECTORY (read)
# Arguments: $1 - directory containing the extracted `page.sql`
# Outputs:   writes the cleaned SQL to ${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql
#
# NOTE(review): titles containing a MySQL-escaped quote (`\'`) are dropped by
# the `[^']*` filter below before the quote-rewriting steps run — confirm
# whether such rows are needed.
clean_pages_sql() {
  local sql_input_file_directory="${1}"
  local sql_input="${sql_input_file_directory}/page.sql"
  local sql_output="${SQL_OUTPUT_DIRECTORY}/2-pages-inserts.sql"

  sed --quiet '/^INSERT INTO `page` VALUES (/p' "${sql_input}" |
    grep -oP "INSERT INTO \`page\` VALUES \(.+?\);" |
    # Strip the statement prefix and the trailing `;` so that EVERY tuple
    # (including the first and last one of each statement) matches the
    # `^\(...\)$`-anchored filters below. Without this, the first tuple kept
    # the "INSERT INTO ..." prefix and the last one kept the ";", and both
    # passed through untransformed.
    sed -e 's/^INSERT INTO `page` VALUES //' -e 's/;$//' |
    # One tuple per line.
    sed 's/),(/)\n(/g' |
    # Keep rows with page_namespace = 0 and page_is_redirect = 0 only.
    grep -P "\([0-9]+,0,'.*?',0" |
    # Keep only page_id and page_title.
    sed -E "s/^\(([0-9]+),0,'([^']*)',0.*\)$/\1,'\2'/" |
    sed "s/\\\'/''/g" | # Replace escaped single quotes
    sed 's/\\"/"/g' | # Replace escaped double quotes
    sed 's/\\\\/\\/g' | # Replace double backslashes with a single backslash
    awk 'BEGIN {print "INSERT INTO pages VALUES"} {print "(" $0 "),"}' |
    sed '$ s/,$/;/g' >"$sql_output"

  echo "Cleaned \"${sql_input}\" to \"${sql_output}\"."
}
# clean_pages_sql "${DUMP_DIRECTORY}"