#!/usr/bin/env bash
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

DATADIR=${DATADIR:-data}

report_error() {
  echo "Error on line $1 of $0"
}

myshuf() {
  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
}

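# Portable shuffle: GNU shuf is not installed by default on macOS, so lines are
# shuffled through perl instead. Illustrative use:
#   printf 'a\nb\nc\n' | myshuf   # prints the three lines in random order
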
normalize_text() {
  tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
    sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
        -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
        -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
}

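# Illustrative example of the transform (hypothetical input row):
#   in:  3,"Bob's Diner","A small place. Great food!"
#   out: __label__3 , bob ' s diner , a small place . great food !
# The output lines are also shuffled via myshuf.
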
# Abort on the first failing command; the ERR trap reports the line it died on.
set -e
trap 'report_error $LINENO' ERR

mkdir -p "${DATADIR}"

# Unsupervised datasets
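# rw.txt is the Stanford Rare Words similarity dataset; each tab-separated row
# starts with a word pair, so `cut -f 1,2` keeps the two words and
# `tr '\t' '\n'` emits one query word per line.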
- data_result="${DATADIR}/rw_queries.txt"
- if [ ! -f "$data_result" ]
- then
- cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "$data_result" || rm -f "$data_result"
- fi
- data_result="${DATADIR}/enwik9.zip"
- if [ ! -f "$data_result" ] || \
- [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "3e773f8a1577fda2e27f871ca17f31fd" ]
- then
- wget -c http://mattmahoney.net/dc/enwik9.zip -P "${DATADIR}" || rm -f "$data_result"
- unzip "$data_result" -d "${DATADIR}" || rm -f "$data_result"
- fi
- data_result="${DATADIR}/fil9"
- if [ ! -f "$data_result" ]
- then
- perl wikifil.pl "${DATADIR}/enwik9" > "$data_result" || rm -f "$data_result"
- fi
- data_result="${DATADIR}/rw/rw.txt"
- if [ ! -f "$data_result" ]
- then
- wget -c https://nlp.stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}"
- unzip "${DATADIR}/rw.zip" -d "${DATADIR}" || rm -f "$data_result"
- fi
# Supervised datasets
# Each dataset comes with a .train and a .test file to measure performance
- echo "Downloading dataset dbpedia"
- data_result="${DATADIR}/dbpedia_csv.tar.gz"
- if [ ! -f "$data_result" ] || \
- [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "8139d58cf075c7f70d085358e73af9b3" ]
- then
- wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "$data_result"
- tar -xzvf "$data_result" -C "${DATADIR}"
- fi
- data_result="${DATADIR}/dbpedia.train"
- if [ ! -f "$data_result" ]
- then
- cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "$data_result" || rm -f "$data_result"
- fi
- data_result="${DATADIR}/dbpedia.test"
- if [ ! -f "$data_result" ]
- then
- cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "$data_result" || rm -f "$data_result"
- fi
- echo "Downloading dataset tatoeba for langid"
- data_result="${DATADIR}"/langid/all.txt
- if [ ! -f "$data_result" ]
- then
- mkdir -p "${DATADIR}"/langid
- wget http://downloads.tatoeba.org/exports/sentences.tar.bz2 -O "${DATADIR}"/langid/sentences.tar.bz2
- tar xvfj "${DATADIR}"/langid/sentences.tar.bz2 --directory "${DATADIR}"/langid || exit 1
- awk -F"\t" '{print"__label__"$2" "$3}' < "${DATADIR}"/langid/sentences.csv | shuf > "$data_result"
- fi
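# Complementary split of the shuffled corpus: the first 10,000 lines become the
# validation set, and tail -n +10001 starts printing at line 10,001, so no
# sentence lands in both files.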
- data_result="${DATADIR}/langid.train"
- if [ ! -f "$data_result" ]
- then
- tail -n +10001 "${DATADIR}"/langid/all.txt > "$data_result"
- fi
- data_result="${DATADIR}/langid.valid"
- if [ ! -f "$data_result" ]
- then
- head -n 10000 "${DATADIR}"/langid/all.txt > "$data_result"
- fi
- echo "Downloading cooking dataset"
- data_result="${DATADIR}"/cooking/cooking.stackexchange.txt
- if [ ! -f "$data_result" ]
- then
- mkdir -p "${DATADIR}"/cooking/
- wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz -O "${DATADIR}"/cooking/cooking.stackexchange.tar.gz
- tar xvzf "${DATADIR}"/cooking/cooking.stackexchange.tar.gz --directory "${DATADIR}"/cooking || exit 1
- cat "${DATADIR}"/cooking/cooking.stackexchange.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" > "${DATADIR}"/cooking/cooking.preprocessed.txt
- fi
- data_result="${DATADIR}"/cooking.train
- if [ ! -f "$data_result" ]
- then
- head -n 12404 "${DATADIR}"/cooking/cooking.preprocessed.txt > "${DATADIR}"/cooking.train
- fi
- data_result="${DATADIR}"/cooking.valid
- if [ ! -f "$data_result" ]
- then
- tail -n 3000 "${DATADIR}"/cooking/cooking.preprocessed.txt > "${DATADIR}"/cooking.valid
- fi
- echo "Checking for YFCC100M"
- data_result="${DATADIR}"/YFCC100M/train
- if [ ! -f "$data_result" ]
- then
- echo 'Download YFCC100M, unpack it and place train into the following path: '"$data_result"
- echo 'You can download YFCC100M at :'"https://fasttext.cc/docs/en/dataset.html"
- echo 'After you download this, run the script again'
- exit 1
- fi
- data_result="${DATADIR}"/YFCC100M/test
- if [ ! -f "$data_result" ]
- then
- echo 'Download YFCC100M, unpack it and place test into the following path: '"$data_result"
- echo 'You can download YFCC100M at :'"https://fasttext.cc/docs/en/dataset.html"
- echo 'After you download this, run the script again'
- exit 1
- fi
DATASET=(
  ag_news
  sogou_news
  dbpedia
  yelp_review_polarity
  yelp_review_full
  yahoo_answers
  amazon_review_full
  amazon_review_polarity
)

ID=(
  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
)
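# The Google Drive ids above pair index-for-index with DATASET; the archives
# are the text classification datasets compiled by Zhang et al. (2015).
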
# Small datasets first: these are served directly, without a confirmation page
for i in {0..0}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    normalize_text < "${DATADIR}/${DATASET[i]}_csv/train.csv" > "${DATADIR}/${DATASET[i]}.train"
    normalize_text < "${DATADIR}/${DATASET[i]}_csv/test.csv" > "${DATADIR}/${DATASET[i]}.test"
  fi
done

# Large datasets require a bit more work due to the extra request page.
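# For archives this large, Google Drive first answers with a "can't scan this
# file for viruses" confirmation page instead of the file. The first curl saves
# the session cookies plus that page; grep -Po extracts the real download href,
# sed undoes the &amp; escaping, and the second curl replays the cookies to
# fetch the archive itself.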
for i in {1..7}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
    curl -L -b /tmp/cookies "https://drive.google.com$(grep -Po 'uc-download-link" [^>]* href="\K[^"]*' /tmp/intermezzo.html | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    normalize_text < "${DATADIR}/${DATASET[i]}_csv/train.csv" > "${DATADIR}/${DATASET[i]}.train"
    normalize_text < "${DATADIR}/${DATASET[i]}_csv/test.csv" > "${DATADIR}/${DATASET[i]}.test"
  fi
done
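
# Typical invocation (the filename below is illustrative; use whatever this
# script is saved as):
#   DATADIR=./data bash get_data.sh
# Every block checks for its output file first, so re-running the script only
# fetches whatever is still missing.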