#!/usr/bin/env bash
# Usage: ./download-wikipedia-dump.sh
# Description: Download and extract Wikipedia database dumps.

set -o errexit
set -o nounset
set -o pipefail

DUMP_DIRECTORY="dump"
DOWNLOAD_DATE="latest"
WIKIPEDIA_DUMP_URL="https://dumps.wikimedia.org/enwiki/${DOWNLOAD_DATE}/enwiki-${DOWNLOAD_DATE}-"

mkdir --parents "${DUMP_DIRECTORY}"

download_file() {
  local filename="${1}"
  local file_path_output="${DUMP_DIRECTORY}/${filename}"
  local file_path_extracted="${DUMP_DIRECTORY}/${filename%.gz}"
  local file_url="${WIKIPEDIA_DUMP_URL}${filename}"

  # Skip the download when either the compressed file or its extracted
  # counterpart is already present (gzip removes the .gz after extraction,
  # so checking only the .gz would re-download the dump on every re-run).
  if [[ ! -f "${file_path_output}" && ! -f "${file_path_extracted}" ]]; then
    echo "Downloading \"${filename}\" from \"${file_url}\"..."
    wget --output-document="${file_path_output}" "${file_url}"
  else
    echo "File \"${filename}\" from \"${file_url}\" already exists."
  fi
}

download_file "page.sql.gz"
download_file "pagelinks.sql.gz"

extract_file() {
  local filename="${1}"
  local file_path_input="${DUMP_DIRECTORY}/${filename}"
  local file_path_output="${DUMP_DIRECTORY}/${filename%.gz}"

  if [[ ! -f "${file_path_output}" ]]; then
    echo "Extracting \"${filename}\" to \"${file_path_output}\"..."
    gzip --decompress "${file_path_input}"
    # Pass `--keep` to retain the original .gz file; not needed here.
    # gzip --decompress --keep "${file_path_input}"
  else
    echo "File \"${filename}\" already extracted."
  fi
}

extract_file "page.sql.gz"
extract_file "pagelinks.sql.gz"
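
# Expected contents of the dump directory after a successful run
# (assuming both archives download and extract without error):
#   dump/page.sql
#   dump/pagelinks.sql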