process_wet_file.sh 912 B

123456789101112131415161718192021222324252627282930
  1. #!/bin/usr/env sh
  2. # Copyright (c) 2018-present, Facebook, Inc.
  3. # All rights reserved.
  4. #
  5. # This source code is licensed under the MIT license found in the
  6. # LICENSE file in the root directory of this source tree.
  7. set -e
  8. URL=$1
  9. FILENAME=$(basename --suffix=".warc.wet.gz" "${URL}")
  10. echo "Processing ${FILENAME}."
  11. wget -q -P tmp "${URL}"
  12. #echo "Extracting ${FILENAME}.warc.wet.gz"
  13. gunzip "tmp/${FILENAME}.warc.wet.gz"
  14. #echo "Language identification for ${FILENAME}.warc.wet"
  15. fastText/fasttext predict-prob lid.176.bin "tmp/${FILENAME}.warc.wet" > "tmp/${FILENAME}.lid"
  16. #echo "Splitting ${FILENAME}.warc.wet per language"
  17. paste "tmp/${FILENAME}.lid" "tmp/${FILENAME}.warc.wet" | \
  18. awk '($2 > 0.8 || ($1=="__label__hr" && $2 > 0.4)) && length() > 100 {lang = substr($1, 10); $1=""; $2=""; print $0 >> "shard/"lang".txt"}'
  19. #echo "Removing tmp files"
  20. rm "tmp/${FILENAME}.lid"
  21. rm "tmp/${FILENAME}.warc.wet"