#!/usr/bin/env bash
# BSD 3-Clause License
#
# Copyright (c) 2017,
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Download and prepare the language-modeling corpora under ./data.
#
# Usage:    run from the directory that contains prep_text8.py (the text8 step
#           invokes ../../prep_text8.py relative to data/text8).
# Requires: bash, wget, unzip, tar, python3 (text8 uses `python` when present).
#
# A dataset is fetched only when its directory does not exist yet, so the
# script can be re-run to pick up datasets that are still missing. Each
# dataset is prepared inside a subshell function whose steps are chained with
# `&&`: a failed download can therefore neither run later steps in the wrong
# place nor move the script's working directory out of data/. Failures are
# collected and reported at the end with a non-zero exit status.
set -u

readonly DATA_DIR='data'
readonly WIKITEXT_URL='https://s3.amazonaws.com/research.metamind.io/wikitext'
readonly ONE_BILLION_NAME='1-billion-word-language-modeling-benchmark-r13output'

# Space-separated list of dataset directories whose preparation failed.
failed=''

#######################################
# Print a diagnostic message to stderr.
# Arguments: $* - message text
#######################################
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Prepare one dataset unless its directory already exists.
# Globals:   failed (appended to when the fetch command fails)
# Arguments: $1  - dataset directory, relative to the data directory
#            $2  - human-readable dataset label used in messages
#            $3+ - command (a fetch_* function plus arguments) to run
# Returns:   0 always; failures are recorded in `failed`
#######################################
ensure_dataset() {
  local dir=$1 label=$2
  shift 2
  if [[ -d "${dir}" ]]; then
    echo "- ${label}: already present, skipping"
    return 0
  fi
  echo "- Downloading ${label}"
  if ! "$@"; then
    warn "  failed to prepare ${label} (${dir})"
    failed="${failed} ${dir}"
  fi
}

#######################################
# Download and unpack a WikiText archive and rename its splits to
# train.txt / valid.txt / test.txt.
# Arguments: $1  - archive stem and resulting directory (e.g. wikitext-2)
#            $2+ - extra options passed to wget (e.g. --quiet)
#######################################
fetch_wikitext() (
  local name=$1
  shift
  wget "$@" --continue "${WIKITEXT_URL}/${name}-v1.zip" &&
    unzip -q "${name}-v1.zip" &&
    cd "${name}" &&
    mv wiki.train.tokens train.txt &&
    mv wiki.valid.tokens valid.txt &&
    mv wiki.test.tokens test.txt
)

#######################################
# Download enwik8 together with its preparation script and run the script.
#######################################
fetch_enwik8() (
  mkdir -p enwik8 &&
    cd enwik8 &&
    wget --continue http://mattmahoney.net/dc/enwik8.zip &&
    wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py &&
    python3 prep_enwik8.py
)

#######################################
# Download text8 and run the repository's prep_text8.py on it.
#######################################
fetch_text8() (
  # Keep using `python` where it exists (previous behavior); fall back to
  # python3 on systems that no longer ship a bare `python` executable.
  local py='python'
  command -v python >/dev/null 2>&1 || py='python3'
  mkdir -p text8 &&
    cd text8 &&
    wget --continue http://mattmahoney.net/dc/text8.zip &&
    "${py}" ../../prep_text8.py
)

#######################################
# Download the Mikolov "simple-examples" archive and extract both the
# word-level (penn/) and character-level (pennchar/) Penn Treebank splits.
#######################################
fetch_ptb() (
  local src='simple-examples/data'
  wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz &&
    tar -xzf simple-examples.tgz &&
    mkdir -p penn &&
    mv "${src}/ptb.train.txt" penn/train.txt &&
    mv "${src}/ptb.test.txt" penn/test.txt &&
    mv "${src}/ptb.valid.txt" penn/valid.txt &&
    echo "- Downloading Penn Treebank (Character)" &&
    mkdir -p pennchar &&
    mv "${src}/ptb.char.train.txt" pennchar/train.txt &&
    mv "${src}/ptb.char.test.txt" pennchar/test.txt &&
    mv "${src}/ptb.char.valid.txt" pennchar/valid.txt &&
    rm -rf simple-examples/
)

#######################################
# Download the One Billion Word benchmark, create valid.txt / test.txt from
# the held-out data and fetch the vocabulary file.
#######################################
fetch_one_billion() (
  local heldout="${ONE_BILLION_NAME}/heldout-monolingual.tokenized.shuffled"
  # NOTE(review): valid.txt and test.txt are both written from held-out shard
  # 00000, exactly as before -- confirm that identical splits are intended.
  mkdir -p one-billion-words &&
    cd one-billion-words &&
    wget --no-proxy "http://www.statmt.org/lm-benchmark/${ONE_BILLION_NAME}.tar.gz" &&
    tar xzvf "${ONE_BILLION_NAME}.tar.gz" &&
    cat "${heldout}/news.en.heldout-00000-of-00050" >valid.txt &&
    cat "${heldout}/news.en.heldout-00000-of-00050" >test.txt &&
    wget https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt
)

echo "=== Acquiring datasets ==="
echo "---"

# Everything below works relative to the data directory; stop right away if it
# cannot be entered rather than scattering downloads over the current one.
if ! mkdir -p "${DATA_DIR}" || ! cd "${DATA_DIR}"; then
  warn "cannot create or enter ${DATA_DIR}/"
  exit 1
fi

ensure_dataset wikitext-2 'WikiText-2 (WT2)' fetch_wikitext wikitext-2 --quiet
ensure_dataset wikitext-103 'WikiText-103 (WT103)' fetch_wikitext wikitext-103
ensure_dataset enwik8 'enwik8 (Character)' fetch_enwik8
ensure_dataset text8 'text8 (Character)' fetch_text8
# The penn/ directory guards both the word- and character-level PTB splits.
ensure_dataset penn 'Penn Treebank (PTB)' fetch_ptb
ensure_dataset one-billion-words '1B words' fetch_one_billion

echo "---"
if [[ -n "${failed}" ]]; then
  warn "Failed to prepare:${failed}"
  # Existing directories are skipped on the next run, so leftovers must go.
  warn "Remove any partially created directories listed above from ${DATA_DIR}/ before re-running."
  exit 1
fi
echo "Happy language modeling :)"