1
0

wikifil.pl 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
#!/usr/bin/perl
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z), and spaces (never consecutive).
# All other characters are converted to spaces. Only text which normally appears
# in the web browser is displayed. Tables are removed. Image captions are
# preserved. Links are converted to normal text. Digits are spelled out.
# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
  use strict;
  use warnings;

  # English word substituted for each ASCII decimal digit.
  my %word_for_digit = (
      0 => 'zero',
      1 => 'one',
      2 => 'two',
      3 => 'three',
      4 => 'four',
      5 => 'five',
      6 => 'six',
      7 => 'seven',
      8 => 'eight',
      9 => 'nine',
  );

  # clean_record($record)
  #
  # Reduce one input record (a chunk of the dump ending at a '>') to text made
  # only of the letters a-z and single spaces.
  #
  # Parameters:
  #   $record - raw record as read from the input.
  # Returns:
  #   The cleaned text. It starts with one space whenever the result is
  #   non-empty and never ends with one, so that printing successive results
  #   back to back does not produce two spaces in a row.
  #
  # The order of the substitutions matters: entities are decoded before tags
  # are stripped, image/category/language links are handled before the generic
  # wiki-link rule, and brackets are dropped only after every link rule ran.
  sub clean_record {
      my ($s) = @_;

      $s =~ s/<.*>//;                 # remove xml tags
      $s =~ s/&amp;/&/g;              # decode URL encoded chars
      $s =~ s/&lt;/</g;
      $s =~ s/&gt;/>/g;
      $s =~ s/<ref[^<]*<\/ref>//g;    # remove references <ref...> ... </ref>
      $s =~ s/<[^>]*>//g;             # remove xhtml tags
      $s =~ s/\[http:[^] ]*/[/g;      # remove normal url, preserve visible text
      $s =~ s/\|thumb//ig;            # remove images links, preserve caption
      $s =~ s/\|left//ig;
      $s =~ s/\|right//ig;
      $s =~ s/\|\d+px//ig;
      $s =~ s/\[\[image:[^\[\]]*\|//ig;
      $s =~ s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig;  # show categories without markup
      $s =~ s/\[\[[a-z\-]*:[^\]]*\]\]//g;  # remove links to other languages
      $s =~ s/\[\[[^\|\]]*\|/[[/g;    # remove wiki url, preserve visible text
      $s =~ s/\{\{[^\}]*\}\}//g;      # remove {{icons}} and {tables}
      $s =~ s/\{[^\}]*\}//g;
      $s =~ s/\[//g;                  # remove [ and ]
      $s =~ s/\]//g;
      $s =~ s/&[^;]*;/ /g;            # remove URL encoded chars

      # Convert to lowercase letters and spaces, spell digits. The padding
      # guarantees a separator on both sides before runs are squeezed.
      $s = " $s ";
      $s =~ tr/A-Z/a-z/;
      # One pass is enough: none of the replacement words contains a digit.
      $s =~ s/([0-9])/ $word_for_digit{$1} /g;
      $s =~ tr/a-z/ /cs;              # everything else -> space, runs squeezed
      chop $s;                        # drop the trailing space added above
      return $s;
  }

  # Read the input in records terminated by '>', so that every record ends
  # right after a tag (or after a literal '>').
  $/ = ">";

  # True while the current record lies between <text ...> and </text>.
  my $in_text = 0;

  while (my $record = <>) {
      $in_text = 1 if $record =~ /<text /;      # remove all but between <text> ... </text>
      $in_text = 0 if $record =~ /#redirect/i;  # remove #REDIRECT
      next unless $in_text;

      # The record holding </text> is still emitted; only later ones are skipped.
      $in_text = 0 if $record =~ /<\/text>/;

      print clean_record($record);
  }