|
|
@@ -0,0 +1,80 @@
|
|
|
+#!/usr/bin/env bash
|
|
|
+#
|
|
|
+# Copyright (c) 2016-present, Facebook, Inc.
|
|
|
+# All rights reserved.
|
|
|
+#
|
|
|
+# This source code is licensed under the BSD-style license found in the
|
|
|
+# LICENSE file in the root directory of this source tree. An additional grant
|
|
|
+# of patent rights can be found in the PATENTS file in the same directory.
|
|
|
+#
|
|
|
+
|
|
|
+set -e
|
|
|
+
|
|
|
+normalize_text() {
|
|
|
+ sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
|
|
|
+ -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
|
|
|
+ -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
|
|
|
+ -e 's/«/ /g' | tr 0-9 " "
|
|
|
+}
|
|
|
+
|
|
|
+export LANGUAGE=en_US.UTF-8
|
|
|
+export LC_ALL=en_US.UTF-8
|
|
|
+export LANG=en_US.UTF-8
|
|
|
+
|
|
|
+NOW=$(date +"%Y%m%d")
|
|
|
+
|
|
|
+ROOT="data/wikimedia/${NOW}"
|
|
|
+mkdir -p "${ROOT}"
|
|
|
+echo "Saving data in ""$ROOT"
|
|
|
+read -r -p "Choose a language (e.g. en, bh, fr, etc.): " choice
|
|
|
+LANG="$choice"
|
|
|
+echo "Chosen language: ""$LANG"
|
|
|
+read -r -p "Continue to download (WARNING: This might be big and can take a long time!)(y/n)? " choice
|
|
|
+case "$choice" in
|
|
|
+ y|Y ) echo "Starting download...";;
|
|
|
+ n|N ) echo "Exiting";exit 1;;
|
|
|
+ * ) echo "Invalid answer";exit 1;;
|
|
|
+esac
|
|
|
+wget -c "https://dumps.wikimedia.org/""$LANG""wiki/latest/""${LANG}""wiki-latest-pages-articles.xml.bz2" -P "${ROOT}"
|
|
|
+echo "Processing ""$ROOT"/"$LANG""wiki-latest-pages-articles.xml.bz2"
|
|
|
+bzip2 -c -d "$ROOT"/"$LANG""wiki-latest-pages-articles.xml.bz2" | awk '{print tolower($0);}' | perl -e '
|
|
|
+# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
|
|
|
+# letters (a-z, converted from A-Z), and spaces (never consecutive)...
|
|
|
+# All other characters are converted to spaces. Only text which normally appears.
|
|
|
+# in the web browser is displayed. Tables are removed. Image captions are.
|
|
|
+# preserved. Links are converted to normal text. Digits are spelled out.
|
|
|
+# *** Modified to not spell digits or throw away non-ASCII characters ***
|
|
|
+# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
|
|
|
+$/=">"; # input record separator
|
|
|
+while (<>) {
|
|
|
+ if (/<text /) {$text=1;} # remove all but between <text> ... </text>
|
|
|
+ if (/#redirect/i) {$text=0;} # remove #REDIRECT
|
|
|
+ if ($text) {
|
|
|
+ # Remove any text not normally visible
|
|
|
+ if (/<\/text>/) {$text=0;}
|
|
|
+ s/<.*>//; # remove xml tags
|
|
|
+ s/&/&/g; # decode URL encoded chars
|
|
|
+ s/</</g;
|
|
|
+ s/>/>/g;
|
|
|
+ s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref>
|
|
|
+ s/<[^>]*>//g; # remove xhtml tags
|
|
|
+ s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text
|
|
|
+ s/\|thumb//ig; # remove images links, preserve caption
|
|
|
+ s/\|left//ig;
|
|
|
+ s/\|right//ig;
|
|
|
+ s/\|\d+px//ig;
|
|
|
+ s/\[\[image:[^\[\]]*\|//ig;
|
|
|
+ s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
|
|
|
+ s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages
|
|
|
+ s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text
|
|
|
+ s/{{[^}]*}}//g; # remove {{icons}} and {tables}
|
|
|
+ s/{[^}]*}//g;
|
|
|
+ s/\[//g; # remove [ and ]
|
|
|
+ s/\]//g;
|
|
|
+ s/&[^;]*;/ /g; # remove URL encoded chars
|
|
|
+ $_=" $_ ";
|
|
|
+ chop;
|
|
|
+ print $_;
|
|
|
+ }
|
|
|
+}
|
|
|
+' | normalize_text | awk '{if (NF>1) print;}' | tr -s " " | shuf > "${ROOT}"/wiki."${LANG}".txt
|