#!/usr/bin/env bash
# Usage: ./download-wikipedia-dump.sh
# Description: Download and extract Wikipedia database dumps.
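# Requires: wget, gzip.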
set -o errexit
set -o nounset
set -o pipefail
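# Abort on command failure, on use of unset variables, and on failures
# anywhere within a pipeline.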
DUMP_DIRECTORY="dump"
DOWNLOAD_DATE="20240420"
# DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
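# Base URL for the English Wikipedia dump; download_file appends the
# target filename (e.g. "page.sql.gz").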
mkdir --parents "${DUMP_DIRECTORY}"
download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"
  # Skip the download if either the archive or its extracted form exists.
  if [[ ! -f "${file_path_output}" && ! -f "${file_path_output%.gz}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" already downloaded or extracted; skipping."
  fi
}
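# Download the compressed page and pagelinks SQL dumps.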
download_file "page.sql.gz"
download_file "pagelinks.sql.gz"
extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"
  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    # Add `--keep` to preserve the original archive; not needed here,
    # so the .gz file is removed after decompression.
    # gzip --decompress --keep "${file_path_input}"
    gzip --decompress "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}
extract_file "page.sql.gz"
extract_file "pagelinks.sql.gz"