#!/usr/bin/env bash
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Download a Wikimedia dump for a chosen language, strip the wiki/XML
# markup, tokenize it, and write a shuffled plain-text corpus to
# data/wikimedia/<date>/wiki.<lang>.txt.

# Fail on errors, unset variables, and failures anywhere in a pipeline
# (a silently failed bzip2/perl stage would otherwise yield an empty corpus).
set -euo pipefail

# Tokenizer: normalize curly quotes/apostrophes, pad punctuation with
# spaces, drop markup characters (= * | «), and blank out digits.
# Reads stdin, writes stdout.
normalize_text() {
  sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" \
      -e "s/“/\"/g" -e "s/”/\"/g" \
      -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' \
      -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' -e 's/\?/ \? /g' \
      -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' \
      -e 's/*/ /g' -e 's/|/ /g' -e 's/«/ /g' \
    | tr 0-9 " "
}

export LANGUAGE=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8

NOW=$(date +"%Y%m%d")
ROOT="data/wikimedia/${NOW}"
mkdir -p "${ROOT}"
echo "Saving data in ${ROOT}"

read -r -p "Choose a language (e.g. en, bh, fr, etc.): " choice
# Keep the wiki language in its own variable: assigning it to LANG would
# clobber the UTF-8 locale exported above and change sed/tr behavior.
WIKI_LANG="$choice"
echo "Chosen language: ${WIKI_LANG}"

read -r -p "Continue to download (WARNING: This might be big and can take a long time!)(y/n)? " choice
case "$choice" in
  y|Y ) echo "Starting download...";;
  n|N ) echo "Exiting"; exit 1;;
  * )   echo "Invalid answer"; exit 1;;
esac

DUMP="${WIKI_LANG}wiki-latest-pages-articles.xml.bz2"
# -c resumes a partially downloaded dump.
wget -c "https://dumps.wikimedia.org/${WIKI_LANG}wiki/latest/${DUMP}" -P "${ROOT}"

echo "Processing ${ROOT}/${DUMP}"
bzip2 -c -d "${ROOT}/${DUMP}" \
  | awk '{print tolower($0);}' \
  | perl -e '
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z), and spaces (never consecutive).
# All other characters are converted to spaces.  Only text which normally appears
# in the web browser is displayed.  Tables are removed.  Image captions are
# preserved.  Links are converted to normal text.  Digits are spelled out.
# *** Modified to not spell digits or throw away non-ASCII characters ***
# Written by Matt Mahoney, June 10, 2006.  This program is released to the public domain.

$/=">";                         # input record separator
while (<>) {
  if (/<text /) {$text=1;}      # keep only text between <text> ... </text>
  if (/#redirect/i) {$text=0;}  # remove #REDIRECT pages
  if ($text) {

    # Remove any text not normally visible
    if (/<\/text>/) {$text=0;}  # end of article text
    s/<.*>//;                   # remove xml tags
    s/&amp;/&/g;                # decode URL encoded chars
    s/&lt;/</g;
    s/&gt;/>/g;
    s/<ref[^<]*<\/ref>//g;      # remove references <ref...> ... </ref>
    s/<[^>]*>//g;               # remove xhtml tags
    s/\[http:[^] ]*/[/g;        # remove normal url, preserve visible text
    s/\|thumb//ig;              # remove images links, preserve caption
    s/\|left//ig;
    s/\|right//ig;
    s/\|\d+px//ig;
    s/\[\[image:[^\[\]]*\|//ig;
    s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig;  # show categories without markup
    s/\[\[[a-z\-]*:[^\]]*\]\]//g;  # remove links to other languages
    s/\[\[[^\|\]]*\|/[[/g;      # remove wiki url, preserve visible text
    s/{{[^}]*}}//g;             # remove {{icons}} and {tables}
    s/{[^}]*}//g;
    s/\[//g;                    # remove [ and ]
    s/\]//g;
    s/&[^;]*;/ /g;              # remove URL encoded chars
    $_=" $_ ";
    chop;
    print $_;
  }
}
' \
  | normalize_text \
  | awk '{if (NF>1) print;}' \
  | tr -s " " \
  | shuf > "${ROOT}/wiki.${WIKI_LANG}.txt"