Переглянути джерело

Common crawl processing scripts

Summary: Preprocessing scripts for common crawl

Reviewed By: ajoulin

Differential Revision: D9539724

fbshipit-source-id: 72b1b0961ec4aa796add9f20246df4c40459f7b9
Edouard Grave 7 роки тому
батько
коміт
61f1838321
6 змінених файлів з 282 додано та 0 видалено
  1. 26 0
      crawl/README.md
  2. 51 0
      crawl/dedup.cc
  3. 57 0
      crawl/download_crawl.sh
  4. 13 0
      crawl/filter_dedup.sh
  5. 105 0
      crawl/filter_utf8.cc
  6. 30 0
      crawl/process_wet_file.sh

+ 26 - 0
crawl/README.md

@@ -0,0 +1,26 @@
+## Preprocessing Common Crawl
+
+This code downloads, preprocesses and splits per language the data from [Common Crawl](http://commoncrawl.org/).
+
+This script uses the scripts and language identifier of [1].
+
+This code inherits its requirements from [fastText](https://github.com/facebookresearch/fastText).
+
+Set the variable WET_PATHS_URL to the crawl you want to process.
+Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
+Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each use 2GB of RAM).
+
+### Reference
+
+If you use this code, please cite:
+
+[1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
+
+```
+@inproceedings{grave2018learning,
+  title={Learning Word Vectors for 157 Languages},
+  author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
+  booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
+  year={2018}
+}
+```

+ 51 - 0
crawl/dedup.cc

@@ -0,0 +1,51 @@
+// Copyright (c) 2018-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <cstdint>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
// Compute the 64-bit FNV-1a hash of a byte buffer.
//
// data: bytes to hash (read-only; const-qualified since the buffer is
//       never modified, so callers may pass const data directly).
// sz:   number of bytes to hash.
// h:    initial hash state. Defaults to the standard FNV-1a 64-bit
//       offset basis; passing a different seed yields an independent
//       hash function from the same routine (used by dedup's filter).
//
// Returns the final hash state.
uint64_t fnv1a_64(const uint8_t *data, size_t sz, uint64_t h=14695981039346656037ull)
{
  for (size_t i = 0; i < sz; i++, data++) {
    h ^= uint64_t(*data);   // FNV-1a: xor the byte first...
    h *= 1099511628211ull;  // ...then multiply by the 64-bit FNV prime
  }
  return h;
}
+
+int main(int argc, char** argv)
+{
+  uint64_t init_values[] = {
+    14695981039346656037ull,
+    9425296925403859339ull,
+    13716263814064014149ull,
+    3525492407291847033ull,
+    8607404175481815707ull,
+    9818874561736458749ull,
+    10026508429719773353ull,
+    3560712257386009938ull
+  };
+  size_t n = 1ull<<34, num_hashes = 2;
+  std::vector<bool> seen(n);
+
+  std::ios_base::sync_with_stdio(false);
+
+  for (std::string line; std::getline(std::cin, line);) {
+    bool b = true;
+    for (size_t i = 0; i < num_hashes; i++) {
+      uint64_t h = fnv1a_64((uint8_t*) line.data(), line.length(), init_values[i]) % n;
+      b = b && seen[h];
+      seen[h] = true;
+    }
+    if (!b) {
+      std::cout << line << std::endl;
+    }
+  }
+  return 0;
+}

+ 57 - 0
crawl/download_crawl.sh

@@ -0,0 +1,57 @@
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Download a Common Crawl snapshot, split it per language with fastText
# language identification, then deduplicate each per-language shard.
# (Shebang fixed: the original "#!/bin/usr/env sh" path does not exist.)

set -e

# Set this variable to the crawl you want to process.
WET_PATHS_URL="https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-34/wet.paths.gz"

# Set NUM_LANGID and NUM_DEDUP according to the capacity of your machine.
# Please note that each dedup process uses 2GB of RAM, while langid is
# mostly limited by cpu usage.
NUM_LANGID=12
NUM_DEDUP=8
URL="https://commoncrawl.s3.amazonaws.com/"

# Fetch and build fastText, used below for language identification.
if [ ! -d fastText ]; then
    git clone https://github.com/facebookresearch/fastText.git
fi

if [ ! -f fastText/fasttext ]; then
    cd fastText
    make
    cd ..
fi

# Pre-trained 176-language identification model.
if [ ! -f lid.176.bin ]; then
    wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/supervised_models/lid.176.bin
fi

# mkdir -p: create working directories, no error if they already exist.
mkdir -p tmp
mkdir -p shard

if [ ! -f wet.paths ]; then
    wget "${WET_PATHS_URL}"
    gunzip wet.paths.gz
fi

## Language identification
cat wet.paths | xargs -n 1 -P "${NUM_LANGID}" -I '{}' sh process_wet_file.sh "${URL}{}"

## Deduplication
g++ -std=c++11 -O3 -o dedup dedup.cc
g++ -std=c++11 -O3 -o filter_utf8 filter_utf8.cc
find shard -name '*.txt' | xargs -n 1 -P "${NUM_DEDUP}" -I '{}' sh filter_dedup.sh "{}"

## Example of data filtering + tokenization
git clone https://github.com/moses-smt/mosesdecoder.git
perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l es < shard/es.dedup > shard/es.tok

+ 13 - 0
crawl/filter_dedup.sh

@@ -0,0 +1,13 @@
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: filter_dedup.sh shard/<lang>.txt
# Removes invalid-UTF-8 lines from one per-language shard, then drops
# duplicate lines, writing shard/<lang>.dedup.
# (Shebang fixed: the original "#!/bin/usr/env sh" path does not exist.)

set -e

# POSIX "basename NAME SUFFIX" form; the GNU-only --suffix= option
# breaks on BSD/macOS.
LG=$(basename "${1}" ".txt")

./filter_utf8 < "shard/${LG}.txt" \
    | ./dedup > "shard/${LG}.dedup"

+ 105 - 0
crawl/filter_utf8.cc

@@ -0,0 +1,105 @@
+// Copyright (c) 2018-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <cstdint>
+#include <iostream>
+#include <string>
+
// Return true when the n bytes at str are all UTF-8 continuation
// bytes, i.e. each matches the bit pattern 10xxxxxx.
bool continuation(uint8_t* str, int n)
{
  int checked = 0;
  while (checked < n) {
    if ((str[checked] & 0xc0) != 0x80) {
      return false;
    }
    checked++;
  }
  return true;
}
+
// Reject codepoints above U+10FFFF, which are not valid UTF-8.
// The largest legal sequence is:
//  * 11110.100 10.001111 10.111111 10.111111
// so a sequence is out of range when its first byte exceeds 0xf4, or
// when the first byte is exactly 0xf4 and the second exceeds 0x8f.
bool invalid(uint8_t* str)
{
  if (str[0] > 0xf4) {
    return true;
  }
  return str[0] == 0xf4 && str[1] > 0x8f;
}
+
// Reject UTF-16 surrogate halves (U+D800 through U+DFFF), which are
// forbidden in UTF-8. Their encodings span:
//  * 1110.1101 10.100000 10.000000
//  * 1110.1101 10.111111 10.111111
// so we check if the first byte equals 0xed and the 0x20 bit of the
// second byte is set.
bool surrogate(uint8_t* str)
{
  if (str[0] != 0xed) {
    return false;
  }
  return (str[1] & 0x20) != 0;
}
+
// A 2-byte sequence 110.yyyyx 10.xxxxxx is overlong (the codepoint
// would fit in fewer bytes) when its leading 4 payload bits y are all
// zero.
bool overlong_2(uint8_t* str)
{
  return !(str[0] & 0x1e);
}
+
// A 3-byte sequence 1110.yyyy 10.yxxxxx 10.xxxxxx is overlong when
// its leading 5 payload bits y are all zero.
bool overlong_3(uint8_t* str)
{
  bool lead_clear = (str[0] & 0x0f) == 0;
  bool next_clear = (str[1] & 0x20) == 0;
  return lead_clear && next_clear;
}
+
// A 4-byte sequence 11110.yyy 10.yyxxxx 10.xxxxxx 10.xxxxxx is
// overlong when its leading 5 payload bits y are all zero.
bool overlong_4(uint8_t* str)
{
  bool lead_clear = (str[0] & 0x07) == 0;
  bool next_clear = (str[1] & 0x30) == 0;
  return lead_clear && next_clear;
}
+
+bool valid_utf8(uint8_t* str, size_t length)
+{
+  uint8_t* end = str + length;
+  while (str < end) {
+    if (str[0] < 0x80) {
+      // 0.xxxxxxx
+      str += 1;
+    } else if ((str[0] & 0xe0) == 0xc0) {
+      // 110.xxxxx 10.xxxxxx
+      if (str + 1 >= end) return false;
+      if (!continuation(str + 1, 1)) return false;
+      if (overlong_2(str)) return false;
+      str += 2;
+    } else if ((str[0] & 0xf0) == 0xe0) {
+      // 1110.xxxx 10.xxxxxx 10.xxxxxx
+      if (str + 2 >= end) return false;
+      if (!continuation(str + 1, 2)) return false;
+      if (overlong_3(str)) return false;
+      if (surrogate(str)) return false;
+      str += 3;
+    } else if ((str[0] & 0xf8) == 0xf0) {
+      // 11110.xxx 10.xxxxxx 10.xxxxxx 10.xxxxxx
+      if (str + 3 >= end) return false;
+      if (!continuation(str + 1, 3)) return false;
+      if (overlong_4(str)) return false;
+      if (invalid(str)) return false;
+      str += 4;
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+int main(int argc, char** argv)
+{
+  std::ios_base::sync_with_stdio(false);
+  for (std::string line; std::getline(std::cin, line);) {
+    if (valid_utf8((uint8_t*) line.data(), line.length())) {
+      std::cout << line << std::endl;
+    }
+  }
+  return 0;
+}

+ 30 - 0
crawl/process_wet_file.sh

@@ -0,0 +1,30 @@
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: process_wet_file.sh <url-of-warc.wet.gz>
# Downloads one WET archive, runs fastText language identification on
# it, and appends each confidently-identified line to its per-language
# shard file under shard/.
# (Shebang fixed: the original "#!/bin/usr/env sh" path does not exist.)

set -e

URL=$1

# POSIX "basename NAME SUFFIX" form; the GNU-only --suffix= option
# breaks on BSD/macOS.
FILENAME=$(basename "${URL}" ".warc.wet.gz")

echo "Processing ${FILENAME}."

wget -q -P tmp "${URL}"

#echo "Extracting ${FILENAME}.warc.wet.gz"
gunzip "tmp/${FILENAME}.warc.wet.gz"

#echo "Language identification for ${FILENAME}.warc.wet"
fastText/fasttext predict-prob lid.176.bin "tmp/${FILENAME}.warc.wet" > "tmp/${FILENAME}.lid"

#echo "Splitting ${FILENAME}.warc.wet per language"
# Keep lines longer than 100 chars whose language probability exceeds
# 0.8 (0.4 for Croatian), and route each to shard/<lang>.txt.
paste "tmp/${FILENAME}.lid" "tmp/${FILENAME}.warc.wet" | \
    awk '($2 > 0.8 || ($1=="__label__hr" && $2 > 0.4)) && length() > 100 {lang = substr($1, 10); $1=""; $2=""; print $0 >> "shard/"lang".txt"}'

#echo "Removing tmp files"
rm "tmp/${FILENAME}.lid"
rm "tmp/${FILENAME}.warc.wet"