#!/usr/bin/env bash
#
# Usage: ./download-wikipedia-dump.sh
#
# Description: Download and extract Wikipedia database dumps.

set -o errexit
set -o nounset
set -o pipefail

# Directory that receives the downloaded and extracted dump files.
readonly DUMP_DIRECTORY="dump"

# Dump snapshot to fetch; switch to "latest" for the most recent snapshot.
# DOWNLOAD_DATE="latest"
DOWNLOAD_DATE="20240420"

# URL prefix; each dump file name is appended to it to form the full URL.
readonly WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"

# -p is the portable spelling; GNU-only --parents fails on BSD/macOS mkdir.
mkdir -p "${DUMP_DIRECTORY}"
#######################################
# Download a single dump file into DUMP_DIRECTORY, skipping work that is
# already done.
# Globals:   DUMP_DIRECTORY (read), WIKIPEDIA_DUMP_URL (read)
# Arguments: $1 - dump file name, e.g. "page.sql.gz"
# Outputs:   progress messages to stdout; the downloaded archive on disk
#######################################
download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  # Skip when either the archive itself or its already-extracted form
  # (same path minus the .gz suffix) is present on disk.
  if [[ ! -f "${file_path_output}" && ! -f "${file_path_output%.gz}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename%.gz}\" from \"${file_url}\" already exists."
  fi
}
# download_file "page.sql.gz"
# download_file "pagelinks.sql.gz"
#######################################
# Decompress a downloaded .gz dump file in place, skipping if already done.
# Globals:   DUMP_DIRECTORY (read)
# Arguments: $1 - compressed dump file name, e.g. "page.sql.gz"
# Outputs:   progress messages to stdout; the extracted file on disk
#######################################
extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    # -d (portable short form) removes the .gz archive after decompressing.
    gzip -d "${file_path_input}"

    # `--keep` flag to keep the original file, not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}
# Extract the dump tables needed downstream.
extract_file "page.sql.gz"
extract_file "pagelinks.sql.gz"