#!/usr/bin/env bash
# getdata.sh
# BSD 3-Clause License
#
# Copyright (c) 2017,
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30. echo "=== Acquiring datasets ==="
  31. echo "---"
  32. mkdir -p data
  33. cd data
  34. if [[ ! -d 'wikitext-2' ]]; then
  35. echo "- Downloading WikiText-2 (WT2)"
  36. wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
  37. unzip -q wikitext-2-v1.zip
  38. cd wikitext-2
  39. mv wiki.train.tokens train.txt
  40. mv wiki.valid.tokens valid.txt
  41. mv wiki.test.tokens test.txt
  42. cd ..
  43. fi
  44. echo "- Downloading WikiText-103 (WT2)"
  45. if [[ ! -d 'wikitext-103' ]]; then
  46. wget --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
  47. unzip -q wikitext-103-v1.zip
  48. cd wikitext-103
  49. mv wiki.train.tokens train.txt
  50. mv wiki.valid.tokens valid.txt
  51. mv wiki.test.tokens test.txt
  52. cd ..
  53. fi
  54. echo "- Downloading enwik8 (Character)"
  55. if [[ ! -d 'enwik8' ]]; then
  56. mkdir -p enwik8
  57. cd enwik8
  58. wget --continue http://mattmahoney.net/dc/enwik8.zip
  59. wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py
  60. python3 prep_enwik8.py
  61. cd ..
  62. fi
  63. echo "- Downloading text8 (Character)"
  64. if [[ ! -d 'text8' ]]; then
  65. mkdir -p text8
  66. cd text8
  67. wget --continue http://mattmahoney.net/dc/text8.zip
  68. python ../../prep_text8.py
  69. cd ..
  70. fi
  71. echo "- Downloading Penn Treebank (PTB)"
  72. if [[ ! -d 'penn' ]]; then
  73. wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
  74. tar -xzf simple-examples.tgz
  75. mkdir -p penn
  76. cd penn
  77. mv ../simple-examples/data/ptb.train.txt train.txt
  78. mv ../simple-examples/data/ptb.test.txt test.txt
  79. mv ../simple-examples/data/ptb.valid.txt valid.txt
  80. cd ..
  81. echo "- Downloading Penn Treebank (Character)"
  82. mkdir -p pennchar
  83. cd pennchar
  84. mv ../simple-examples/data/ptb.char.train.txt train.txt
  85. mv ../simple-examples/data/ptb.char.test.txt test.txt
  86. mv ../simple-examples/data/ptb.char.valid.txt valid.txt
  87. cd ..
  88. rm -rf simple-examples/
  89. fi
  90. echo "- Downloading 1B words"
  91. if [[ ! -d 'one-billion-words' ]]; then
  92. mkdir -p one-billion-words
  93. cd one-billion-words
  94. wget --no-proxy http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
  95. tar xzvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
  96. path="1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/"
  97. cat ${path}/news.en.heldout-00000-of-00050 > valid.txt
  98. cat ${path}/news.en.heldout-00000-of-00050 > test.txt
  99. wget https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt
  100. cd ..
  101. fi
  102. echo "---"
  103. echo "Happy language modeling :)"