Переглянути джерело

Common crawl processing scripts

Summary: Preprocessing scripts for common crawl

Reviewed By: ajoulin

Differential Revision: D9539724

fbshipit-source-id: 72b1b0961ec4aa796add9f20246df4c40459f7b9
Edouard Grave 7 роки тому
батько
коміт
61f1838321
6 змінених файлів з 282 додано та 0 видалено
  1. 26 0
      crawl/README.md
  2. 51 0
      crawl/dedup.cc
  3. 57 0
      crawl/download_crawl.sh
  4. 13 0
      crawl/filter_dedup.sh
  5. 105 0
      crawl/filter_utf8.cc
  6. 30 0
      crawl/process_wet_file.sh

+ 26 - 0
crawl/README.md

@@ -0,0 +1,26 @@
+## Preprocessing Common Crawl
+
+This code downloads, preprocesses and splits per language the data from [Common Crawl](http://commoncrawl.org/).
+
+This script uses the scripts and language identifier of [1].
+
+This code inherits its requirements from [fastText](https://github.com/facebookresearch/fastText).
+
+Set the variable WET_PATHS_URL to the crawl you want to process.
+Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
+Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each use 2GB of RAM).
+
+### Reference
+
+If you use this code, please cite:
+
+[1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
+
+```
+@inproceedings{grave2018learning,
+  title={Learning Word Vectors for 157 Languages},
+  author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
+  booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
+  year={2018}
+}
+```

+ 51 - 0
crawl/dedup.cc

@@ -0,0 +1,51 @@
+// Copyright (c) 2018-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <cstdint>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
// Compute the 64-bit FNV-1a hash of a byte buffer.
//
// data: bytes to hash (read-only; const-qualified since the buffer is
//       never modified, so callers may pass const data directly).
// sz:   number of bytes to hash.
// h:    initial hash state. Defaults to the standard FNV-1a 64-bit
//       offset basis; passing a different seed yields an independent
//       hash function from the same routine (used by dedup's filter).
//
// Returns the final hash state.
uint64_t fnv1a_64(const uint8_t *data, size_t sz, uint64_t h=14695981039346656037ull)
{
  for (size_t i = 0; i < sz; i++, data++) {
    h ^= uint64_t(*data);   // FNV-1a: xor the byte first...
    h *= 1099511628211ull;  // ...then multiply by the 64-bit FNV prime
  }
  return h;
}
+
+int main(int argc, char** argv)
+{
+  uint64_t init_values[] = {
+    14695981039346656037ull,
+    9425296925403859339ull,
+    13716263814064014149ull,
+    3525492407291847033ull,
+    8607404175481815707ull,
+    9818874561736458749ull,
+    10026508429719773353ull,
+    3560712257386009938ull
+  };
+  size_t n = 1ull<<34, num_hashes = 2;
+  std::vector<bool> seen(n);
+
+  std::ios_base::sync_with_stdio(false);
+
+  for (std::string line; std::getline(std::cin, line);) {
+    bool b = true;
+    for (size_t i = 0; i < num_hashes; i++) {
+      uint64_t h = fnv1a_64((uint8_t*) line.data(), line.length(), init_values[i]) % n;
+      b = b && seen[h];
+      seen[h] = true;
+    }
+    if (!b) {
+      std::cout << line << std::endl;
+    }
+  }
+  return 0;
+}

+ 57 - 0
crawl/download_crawl.sh

@@ -0,0 +1,57 @@
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Download a Common Crawl snapshot, split it per language with fastText
# language identification, then deduplicate each per-language shard.
# (Shebang fixed: the original "#!/bin/usr/env sh" path does not exist.)

set -e

# Set this variable to the crawl you want to process.
WET_PATHS_URL="https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-34/wet.paths.gz"

# Set NUM_LANGID and NUM_DEDUP according to the capacity of your machine.
# Please note that each dedup process uses 2GB of RAM, while langid is
# mostly limited by cpu usage.
NUM_LANGID=12
NUM_DEDUP=8
URL="https://commoncrawl.s3.amazonaws.com/"

# Fetch and build fastText, used below for language identification.
if [ ! -d fastText ]; then
    git clone https://github.com/facebookresearch/fastText.git
fi

if [ ! -f fastText/fasttext ]; then
    cd fastText
    make
    cd ..
fi

# Pre-trained 176-language identification model.
if [ ! -f lid.176.bin ]; then
    wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/supervised_models/lid.176.bin
fi

# mkdir -p: create working directories, no error if they already exist.
mkdir -p tmp
mkdir -p shard

if [ ! -f wet.paths ]; then
    wget "${WET_PATHS_URL}"
    gunzip wet.paths.gz
fi

## Language identification
cat wet.paths | xargs -n 1 -P "${NUM_LANGID}" -I '{}' sh process_wet_file.sh "${URL}{}"

## Deduplication
g++ -std=c++11 -O3 -o dedup dedup.cc
g++ -std=c++11 -O3 -o filter_utf8 filter_utf8.cc
find shard -name '*.txt' | xargs -n 1 -P "${NUM_DEDUP}" -I '{}' sh filter_dedup.sh "{}"

## Example of data filtering + tokenization
git clone https://github.com/moses-smt/mosesdecoder.git
perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l es < shard/es.dedup > shard/es.tok

+ 13 - 0
crawl/filter_dedup.sh

@@ -0,0 +1,13 @@
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: filter_dedup.sh shard/<lang>.txt
# Removes invalid-UTF-8 lines from one per-language shard, then drops
# duplicate lines, writing shard/<lang>.dedup.
# (Shebang fixed: the original "#!/bin/usr/env sh" path does not exist.)

set -e

# POSIX "basename NAME SUFFIX" form; the GNU-only --suffix= option
# breaks on BSD/macOS.
LG=$(basename "${1}" ".txt")

./filter_utf8 < "shard/${LG}.txt" \
    | ./dedup > "shard/${LG}.dedup"

+ 105 - 0
crawl/filter_utf8.cc

@@ -0,0 +1,105 @@
+// Copyright (c) 2018-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <cstdint>
+#include <iostream>
+#include <string>
+
// Return true when the n bytes at str are all UTF-8 continuation
// bytes, i.e. each matches the bit pattern 10xxxxxx.
bool continuation(uint8_t* str, int n)
{
  int checked = 0;
  while (checked < n) {
    if ((str[checked] & 0xc0) != 0x80) {
      return false;
    }
    checked++;
  }
  return true;
}
+
// Reject codepoints above U+10FFFF, which are not valid UTF-8.
// The largest legal sequence is:
//  * 11110.100 10.001111 10.111111 10.111111
// so a sequence is out of range when its first byte exceeds 0xf4, or
// when the first byte is exactly 0xf4 and the second exceeds 0x8f.
bool invalid(uint8_t* str)
{
  if (str[0] > 0xf4) {
    return true;
  }
  return str[0] == 0xf4 && str[1] > 0x8f;
}
+
// Reject UTF-16 surrogate halves (U+D800 through U+DFFF), which are
// forbidden in UTF-8. Their encodings span:
//  * 1110.1101 10.100000 10.000000
//  * 1110.1101 10.111111 10.111111
// so we check if the first byte equals 0xed and the 0x20 bit of the
// second byte is set.
bool surrogate(uint8_t* str)
{
  if (str[0] != 0xed) {
    return false;
  }
  return (str[1] & 0x20) != 0;
}
+
// A 2-byte sequence 110.yyyyx 10.xxxxxx is overlong (the codepoint
// would fit in fewer bytes) when its leading 4 payload bits y are all
// zero.
bool overlong_2(uint8_t* str)
{
  return !(str[0] & 0x1e);
}
+
// A 3-byte sequence 1110.yyyy 10.yxxxxx 10.xxxxxx is overlong when
// its leading 5 payload bits y are all zero.
bool overlong_3(uint8_t* str)
{
  bool lead_clear = (str[0] & 0x0f) == 0;
  bool next_clear = (str[1] & 0x20) == 0;
  return lead_clear && next_clear;
}
+
// A 4-byte sequence 11110.yyy 10.yyxxxx 10.xxxxxx 10.xxxxxx is
// overlong when its leading 5 payload bits y are all zero.
bool overlong_4(uint8_t* str)
{
  bool lead_clear = (str[0] & 0x07) == 0;
  bool next_clear = (str[1] & 0x30) == 0;
  return lead_clear && next_clear;
}
+
+bool valid_utf8(uint8_t* str, size_t length)
+{
+  uint8_t* end = str + length;
+  while (str < end) {
+    if (str[0] < 0x80) {
+      // 0.xxxxxxx
+      str += 1;
+    } else if ((str[0] & 0xe0) == 0xc0) {
+      // 110.xxxxx 10.xxxxxx
+      if (str + 1 >= end) return false;
+      if (!continuation(str + 1, 1)) return false;
+      if (overlong_2(str)) return false;
+      str += 2;
+    } else if ((str[0] & 0xf0) == 0xe0) {
+      // 1110.xxxx 10.xxxxxx 10.xxxxxx
+      if (str + 2 >= end) return false;
+      if (!continuation(str + 1, 2)) return false;
+      if (overlong_3(str)) return false;
+      if (surrogate(str)) return false;
+      str += 3;
+    } else if ((str[0] & 0xf8) == 0xf0) {
+      // 11110.xxx 10.xxxxxx 10.xxxxxx 10.xxxxxx
+      if (str + 3 >= end) return false;
+      if (!continuation(str + 1, 3)) return false;
+      if (overlong_4(str)) return false;
+      if (invalid(str)) return false;
+      str += 4;
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+int main(int argc, char** argv)
+{
+  std::ios_base::sync_with_stdio(false);
+  for (std::string line; std::getline(std::cin, line);) {
+    if (valid_utf8((uint8_t*) line.data(), line.length())) {
+      std::cout << line << std::endl;
+    }
+  }
+  return 0;
+}

+ 30 - 0
crawl/process_wet_file.sh

@@ -0,0 +1,30 @@
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: process_wet_file.sh <url-of-warc.wet.gz>
# Downloads one WET archive, runs fastText language identification on
# it, and appends each confidently-identified line to its per-language
# shard file under shard/.
# (Shebang fixed: the original "#!/bin/usr/env sh" path does not exist.)

set -e

URL=$1

# POSIX "basename NAME SUFFIX" form; the GNU-only --suffix= option
# breaks on BSD/macOS.
FILENAME=$(basename "${URL}" ".warc.wet.gz")

echo "Processing ${FILENAME}."

wget -q -P tmp "${URL}"

#echo "Extracting ${FILENAME}.warc.wet.gz"
gunzip "tmp/${FILENAME}.warc.wet.gz"

#echo "Language identification for ${FILENAME}.warc.wet"
fastText/fasttext predict-prob lid.176.bin "tmp/${FILENAME}.warc.wet" > "tmp/${FILENAME}.lid"

#echo "Splitting ${FILENAME}.warc.wet per language"
# Keep lines longer than 100 chars whose language probability exceeds
# 0.8 (0.4 for Croatian), and route each to shard/<lang>.txt.
paste "tmp/${FILENAME}.lid" "tmp/${FILENAME}.warc.wet" | \
    awk '($2 > 0.8 || ($1=="__label__hr" && $2 > 0.4)) && length() > 100 {lang = substr($1, 10); $1=""; $2=""; print $0 >> "shard/"lang".txt"}'

#echo "Removing tmp files"
rm "tmp/${FILENAME}.lid"
rm "tmp/${FILENAME}.warc.wet"