#!/usr/bin/env sh
# download_crawl.sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Download a Common Crawl snapshot, split it into per-language shards using
# fastText language identification, deduplicate each shard, and show an
# example tokenization step. Expects process_wet_file.sh, filter_dedup.sh,
# dedup.cc and filter_utf8.cc to sit next to this script.
set -e

# Set this variable to the crawl you want to process.
WET_PATHS_URL="https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-34/wet.paths.gz"

# Set NUM_LANGID and NUM_DEDUP according to the capacity of your machine.
# Please note that each dedup process uses 2GB of RAM, while langid is
# mostly limited by cpu usage.
NUM_LANGID=12
NUM_DEDUP=8

URL="https://commoncrawl.s3.amazonaws.com/"

# Fetch and build fastText (used for language identification) if needed.
if [ ! -d fastText ]; then
  git clone https://github.com/facebookresearch/fastText.git
fi
if [ ! -f fastText/fasttext ]; then
  # Build in a subshell so a failed make cannot leave us in fastText/.
  ( cd fastText && make )
fi

# Pre-trained 176-language identification model.
if [ ! -f lid.176.bin ]; then
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
fi

# Working directories; -p makes re-runs a no-op.
mkdir -p tmp shard

# List of WET file paths for the chosen crawl.
if [ ! -f wet.paths ]; then
  wget "${WET_PATHS_URL}"
  gunzip wet.paths.gz
fi

## Language identification
xargs -n 1 -P "${NUM_LANGID}" -I '{}' sh process_wet_file.sh "${URL}{}" < wet.paths

## Deduplication
g++ -std=c++11 -O3 -o dedup dedup.cc
g++ -std=c++11 -O3 -o filter_utf8 filter_utf8.cc
find shard -name '*.txt' | xargs -n 1 -P "${NUM_DEDUP}" -I '{}' sh filter_dedup.sh "{}"

## Example of data filtering + tokenization
# Guard the clone so re-running the script does not abort under set -e.
if [ ! -d mosesdecoder ]; then
  git clone https://github.com/moses-smt/mosesdecoder.git
fi
perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l es < shard/es.dedup > shard/es.tok