#!/usr/bin/env bash
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
  9. DATADIR=${DATADIR:-data}
  10. report_error() {
  11. echo "Error on line $1 of $0"
  12. }
  13. myshuf() {
  14. perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
  15. }
  16. normalize_text() {
  17. tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
  18. sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
  19. -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
  20. -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
  21. }
  22. set -e
  23. trap 'report_error $LINENO' ERR
  24. mkdir -p "${DATADIR}"
  25. # Unsupervised datasets
  26. data_result="${DATADIR}/rw_queries.txt"
  27. if [ ! -f "$data_result" ]
  28. then
  29. cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "$data_result" || rm -f "$data_result"
  30. fi
  31. data_result="${DATADIR}/enwik9.zip"
  32. if [ ! -f "$data_result" ] || \
  33. [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "3e773f8a1577fda2e27f871ca17f31fd" ]
  34. then
  35. wget -c http://mattmahoney.net/dc/enwik9.zip -P "${DATADIR}" || rm -f "$data_result"
  36. unzip "$data_result" -d "${DATADIR}" || rm -f "$data_result"
  37. fi
  38. data_result="${DATADIR}/fil9"
  39. if [ ! -f "$data_result" ]
  40. then
  41. perl wikifil.pl "${DATADIR}/enwik9" > "$data_result" || rm -f "$data_result"
  42. fi
  43. data_result="${DATADIR}/rw/rw.txt"
  44. if [ ! -f "$data_result" ]
  45. then
  46. wget -c https://nlp.stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}"
  47. unzip "${DATADIR}/rw.zip" -d "${DATADIR}" || rm -f "$data_result"
  48. fi
  49. # Supervised datasets
  50. # Each datasets comes with a .train and a .test to measure performance
  51. echo "Downloading dataset dbpedia"
  52. data_result="${DATADIR}/dbpedia_csv.tar.gz"
  53. if [ ! -f "$data_result" ] || \
  54. [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "8139d58cf075c7f70d085358e73af9b3" ]
  55. then
  56. wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "$data_result"
  57. wget -c "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" -O "$data_result"
  58. tar -xzvf "$data_result" -C "${DATADIR}"
  59. fi
  60. data_result="${DATADIR}/dbpedia.train"
  61. if [ ! -f "$data_result" ]
  62. then
  63. cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "$data_result" || rm -f "$data_result"
  64. fi
  65. data_result="${DATADIR}/dbpedia.test"
  66. if [ ! -f "$data_result" ]
  67. then
  68. cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "$data_result" || rm -f "$data_result"
  69. fi
  70. echo "Downloading dataset tatoeba for langid"
  71. data_result="${DATADIR}"/langid/all.txt
  72. if [ ! -f "$data_result" ]
  73. then
  74. mkdir -p "${DATADIR}"/langid
  75. wget http://downloads.tatoeba.org/exports/sentences.tar.bz2 -O "${DATADIR}"/langid/sentences.tar.bz2
  76. tar xvfj "${DATADIR}"/langid/sentences.tar.bz2 --directory "${DATADIR}"/langid || exit 1
  77. awk -F"\t" '{print"__label__"$2" "$3}' < "${DATADIR}"/langid/sentences.csv | shuf > "$data_result"
  78. fi
  79. data_result="${DATADIR}/langid.train"
  80. if [ ! -f "$data_result" ]
  81. then
  82. tail -n +10001 "${DATADIR}"/langid/all.txt > "$data_result"
  83. fi
  84. data_result="${DATADIR}/langid.valid"
  85. if [ ! -f "$data_result" ]
  86. then
  87. head -n 10000 "${DATADIR}"/langid/all.txt > "$data_result"
  88. fi
  89. echo "Downloading cooking dataset"
  90. data_result="${DATADIR}"/cooking/cooking.stackexchange.txt
  91. if [ ! -f "$data_result" ]
  92. then
  93. mkdir -p "${DATADIR}"/cooking/
  94. wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz -O "${DATADIR}"/cooking/cooking.stackexchange.tar.gz
  95. tar xvzf "${DATADIR}"/cooking/cooking.stackexchange.tar.gz --directory "${DATADIR}"/cooking || exit 1
  96. cat "${DATADIR}"/cooking/cooking.stackexchange.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" > "${DATADIR}"/cooking/cooking.preprocessed.txt
  97. fi
  98. data_result="${DATADIR}"/cooking.train
  99. if [ ! -f "$data_result" ]
  100. then
  101. head -n 12404 "${DATADIR}"/cooking/cooking.preprocessed.txt > "${DATADIR}"/cooking.train
  102. fi
  103. data_result="${DATADIR}"/cooking.valid
  104. if [ ! -f "$data_result" ]
  105. then
  106. tail -n 3000 "${DATADIR}"/cooking/cooking.preprocessed.txt > "${DATADIR}"/cooking.valid
  107. fi
  108. echo "Checking for YFCC100M"
  109. data_result="${DATADIR}"/YFCC100M/train
  110. if [ ! -f "$data_result" ]
  111. then
  112. echo 'Download YFCC100M, unpack it and place train into the following path: '"$data_result"
  113. echo 'You can download YFCC100M at :'"https://fasttext.cc/docs/en/dataset.html"
  114. echo 'After you download this, run the script again'
  115. exit 1
  116. fi
  117. data_result="${DATADIR}"/YFCC100M/test
  118. if [ ! -f "$data_result" ]
  119. then
  120. echo 'Download YFCC100M, unpack it and place test into the following path: '"$data_result"
  121. echo 'You can download YFCC100M at :'"https://fasttext.cc/docs/en/dataset.html"
  122. echo 'After you download this, run the script again'
  123. exit 1
  124. fi
  125. DATASET=(
  126. ag_news
  127. sogou_news
  128. dbpedia
  129. yelp_review_polarity
  130. yelp_review_full
  131. yahoo_answers
  132. amazon_review_full
  133. amazon_review_polarity
  134. )
  135. ID=(
  136. 0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
  137. 0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
  138. 0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
  139. 0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
  140. 0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
  141. 0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
  142. 0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
  143. 0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
  144. )
  145. # Small datasets first
  146. for i in {0..0}
  147. do
  148. echo "Downloading dataset ${DATASET[i]}"
  149. if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  150. then
  151. wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
  152. tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
  153. cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
  154. cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  155. fi
  156. done
  157. # Large datasets require a bit more work due to the extra request page
  158. for i in {1..7}
  159. do
  160. echo "Downloading dataset ${DATASET[i]}"
  161. if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  162. then
  163. curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
  164. curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
  165. tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
  166. cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
  167. cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  168. fi
  169. done