#!/usr/bin/env bash
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
  9. set -e
  10. normalize_text() {
  11. sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
  12. -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
  13. -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
  14. -e 's/«/ /g' | tr 0-9 " "
  15. }
  16. export LANGUAGE=en_US.UTF-8
  17. export LC_ALL=en_US.UTF-8
  18. export LANG=en_US.UTF-8
  19. NOW=$(date +"%Y%m%d")
  20. ROOT="data/wikimedia/${NOW}"
  21. mkdir -p "${ROOT}"
  22. echo "Saving data in ""$ROOT"
  23. read -r -p "Choose a language (e.g. en, bh, fr, etc.): " choice
  24. LANG="$choice"
  25. echo "Chosen language: ""$LANG"
  26. read -r -p "Continue to download (WARNING: This might be big and can take a long time!)(y/n)? " choice
  27. case "$choice" in
  28. y|Y ) echo "Starting download...";;
  29. n|N ) echo "Exiting";exit 1;;
  30. * ) echo "Invalid answer";exit 1;;
  31. esac
  32. wget -c "https://dumps.wikimedia.org/""$LANG""wiki/latest/""${LANG}""wiki-latest-pages-articles.xml.bz2" -P "${ROOT}"
  33. echo "Processing ""$ROOT"/"$LANG""wiki-latest-pages-articles.xml.bz2"
  34. bzip2 -c -d "$ROOT"/"$LANG""wiki-latest-pages-articles.xml.bz2" | awk '{print tolower($0);}' | perl -e '
  35. # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
  36. # letters (a-z, converted from A-Z), and spaces (never consecutive)...
  37. # All other characters are converted to spaces. Only text which normally appears.
  38. # in the web browser is displayed. Tables are removed. Image captions are.
  39. # preserved. Links are converted to normal text. Digits are spelled out.
  40. # *** Modified to not spell digits or throw away non-ASCII characters ***
  41. # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
  42. $/=">"; # input record separator
  43. while (<>) {
  44. if (/<text /) {$text=1;} # remove all but between <text> ... </text>
  45. if (/#redirect/i) {$text=0;} # remove #REDIRECT
  46. if ($text) {
  47. # Remove any text not normally visible
  48. if (/<\/text>/) {$text=0;}
  49. s/<.*>//; # remove xml tags
  50. s/&amp;/&/g; # decode URL encoded chars
  51. s/&lt;/</g;
  52. s/&gt;/>/g;
  53. s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref>
  54. s/<[^>]*>//g; # remove xhtml tags
  55. s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text
  56. s/\|thumb//ig; # remove images links, preserve caption
  57. s/\|left//ig;
  58. s/\|right//ig;
  59. s/\|\d+px//ig;
  60. s/\[\[image:[^\[\]]*\|//ig;
  61. s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
  62. s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages
  63. s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text
  64. s/{{[^}]*}}//g; # remove {{icons}} and {tables}
  65. s/{[^}]*}//g;
  66. s/\[//g; # remove [ and ]
  67. s/\]//g;
  68. s/&[^;]*;/ /g; # remove URL encoded chars
  69. $_=" $_ ";
  70. chop;
  71. print $_;
  72. }
  73. }
  74. ' | normalize_text | awk '{if (NF>1) print;}' | tr -s " " | shuf > "${ROOT}"/wiki."${LANG}".txt