#!/usr/bin/env bash

# Usage: ./download-wikipedia-dump.sh
# Description: Download and extract Wikipedia database dumps.

set -o errexit
set -o nounset
set -o pipefail

DUMP_DIRECTORY="dump"
DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"
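# With DOWNLOAD_DATE="latest", appending a filename to the base URL gives, e.g.:
#   https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz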
mkdir --parents "${DUMP_DIRECTORY}"

download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" from \"${file_url}\" already exists."
  fi
}
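# The two files fetched below, per the standard enwiki dump layout:
# - page.sql.gz: the `page` table (page ID, namespace, title).
# - pagelinks.sql.gz: the `pagelinks` table (internal links between pages).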
download_file "page.sql.gz"
download_file "pagelinks.sql.gz"

extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  # "${filename%.gz}" strips the trailing ".gz", e.g. "page.sql.gz" -> "page.sql".
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    gzip --decompress "${file_path_input}"

    # Pass `--keep` to keep the compressed original as well; not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}

extract_file "page.sql.gz"
extract_file "pagelinks.sql.gz"
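# After a successful run, the dump/ directory holds the decompressed
# page.sql and pagelinks.sql files.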