Ver Fonte

[BERT/TF] Added multi-node support

Przemek Strzelczyk há 6 anos
pai
commit
a98df279fe
78 ficheiros alterados com 4112 adições e 2033 exclusões
  1. 19 1
      TensorFlow/LanguageModeling/BERT/.dockerignore
  2. 6 1
      TensorFlow/LanguageModeling/BERT/.gitignore
  3. 8 5
      TensorFlow/LanguageModeling/BERT/Dockerfile
  4. 409 183
      TensorFlow/LanguageModeling/BERT/README.md
  5. 218 0
      TensorFlow/LanguageModeling/BERT/configurations.yml
  6. 26 0
      TensorFlow/LanguageModeling/BERT/data/BooksDownloader.py
  7. 32 0
      TensorFlow/LanguageModeling/BERT/data/BookscorpusTextFormatting.py
  8. 120 0
      TensorFlow/LanguageModeling/BERT/data/Downloader.py
  9. 109 0
      TensorFlow/LanguageModeling/BERT/data/GLUEDownloader.py
  10. 158 0
      TensorFlow/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py
  11. 27 0
      TensorFlow/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py
  12. 93 0
      TensorFlow/LanguageModeling/BERT/data/PubMedDownloader.py
  13. 44 0
      TensorFlow/LanguageModeling/BERT/data/PubMedTextFormatting.py
  14. 54 0
      TensorFlow/LanguageModeling/BERT/data/SquadDownloader.py
  15. 331 0
      TensorFlow/LanguageModeling/BERT/data/TextSharding.py
  16. 58 0
      TensorFlow/LanguageModeling/BERT/data/WikiDownloader.py
  17. 46 0
      TensorFlow/LanguageModeling/BERT/data/WikicorpusTextFormatting.py
  18. 12 0
      TensorFlow/LanguageModeling/BERT/data/__init__.py
  19. 389 0
      TensorFlow/LanguageModeling/BERT/data/bertPrep.py
  20. 0 15
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/clean_and_merge_text.py
  21. 0 27
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/config.sh
  22. 0 18
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/create_pseudo_test_set.py
  23. 0 10
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/create_pseudo_test_set.sh
  24. 0 23
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/preprocessing.sh
  25. 0 28
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/preprocessing_test_set.sh
  26. 0 12
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/preprocessing_test_set_xargs_wrapper.sh
  27. 0 13
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/preprocessing_xargs_wrapper.sh
  28. 0 28
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/run_preprocessing.sh
  29. 0 20
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/sentence_segmentation_nltk.py
  30. 0 41
      TensorFlow/LanguageModeling/BERT/data/bookcorpus/shard_text_input_file.py
  31. 46 0
      TensorFlow/LanguageModeling/BERT/data/create_datasets_from_start.sh
  32. 0 153
      TensorFlow/LanguageModeling/BERT/data/glue/download_glue_data.py
  33. 0 123
      TensorFlow/LanguageModeling/BERT/data/pretrained_models_google/download_models.py
  34. 0 60
      TensorFlow/LanguageModeling/BERT/data/squad/squad_download.sh
  35. 0 28
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/config.sh
  36. 0 18
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/create_pseudo_test_set.py
  37. 0 10
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/create_pseudo_test_set.sh
  38. 0 23
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/preprocessing.sh
  39. 0 28
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/preprocessing_test_set.sh
  40. 0 12
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/preprocessing_test_set_xargs_wrapper.sh
  41. 0 13
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/preprocessing_xargs_wrapper.sh
  42. 0 30
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/remove_tags_and_clean.py
  43. 0 49
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/run_preprocessing.sh
  44. 0 39
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/shard_text_input_file.py
  45. 0 20
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/wiki_sentence_segmentation_nltk.py
  46. 0 22
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/wiki_sentence_segmentation_spacy.py
  47. 0 33
      TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/wiki_sentence_segmentation_spacy_pipe.py
  48. 244 43
      TensorFlow/LanguageModeling/BERT/optimization.py
  49. 73 0
      TensorFlow/LanguageModeling/BERT/run.sub
  50. 32 27
      TensorFlow/LanguageModeling/BERT/run_classifier.py
  51. 124 69
      TensorFlow/LanguageModeling/BERT/run_pretraining.py
  52. 0 19
      TensorFlow/LanguageModeling/BERT/run_pretraining.sh
  53. 11 3
      TensorFlow/LanguageModeling/BERT/run_squad.py
  54. 13 0
      TensorFlow/LanguageModeling/BERT/run_squad_trtis_client.py
  55. 14 1
      TensorFlow/LanguageModeling/BERT/scripts/data_download.sh
  56. 0 17
      TensorFlow/LanguageModeling/BERT/scripts/data_download_helper.sh
  57. 16 5
      TensorFlow/LanguageModeling/BERT/scripts/finetune_inference_benchmark.sh
  58. 19 15
      TensorFlow/LanguageModeling/BERT/scripts/finetune_train_benchmark.sh
  59. 51 45
      TensorFlow/LanguageModeling/BERT/scripts/run_glue.sh
  60. 78 0
      TensorFlow/LanguageModeling/BERT/scripts/run_glue_inference.sh
  61. 0 102
      TensorFlow/LanguageModeling/BERT/scripts/run_pretraining.sh
  62. 111 0
      TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_adam.sh
  63. 60 0
      TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb.sh
  64. 103 0
      TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase1.sh
  65. 115 0
      TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase2.sh
  66. 51 51
      TensorFlow/LanguageModeling/BERT/scripts/run_squad.sh
  67. 31 23
      TensorFlow/LanguageModeling/BERT/scripts/run_squad_inference.sh
  68. 17 3
      TensorFlow/LanguageModeling/BERT/scripts/trtis/export_model.sh
  69. 17 2
      TensorFlow/LanguageModeling/BERT/scripts/trtis/generate_figures.sh
  70. 17 2
      TensorFlow/LanguageModeling/BERT/scripts/trtis/run_client.sh
  71. 13 1
      TensorFlow/LanguageModeling/BERT/scripts/trtis/run_perf_client.sh
  72. 18 3
      TensorFlow/LanguageModeling/BERT/scripts/trtis/run_trtis.sh
  73. 13 0
      TensorFlow/LanguageModeling/BERT/scripts/trtis/wait_for_trtis_server.sh
  74. 286 234
      TensorFlow/LanguageModeling/BERT/tokenization.py
  75. 13 0
      TensorFlow/LanguageModeling/BERT/utils/create_glue_data.py
  76. 341 282
      TensorFlow/LanguageModeling/BERT/utils/create_pretraining_data.py
  77. 13 0
      TensorFlow/LanguageModeling/BERT/utils/create_squad_data.py
  78. 13 0
      TensorFlow/LanguageModeling/BERT/utils/utils.py

+ 19 - 1
TensorFlow/LanguageModeling/BERT/.dockerignore

@@ -1,6 +1,24 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 .idea/
 .git/
 __pycache__/
 results/
-data/
+data/download
+data/extracted
+data/formatted_one_article_per_line
+data/sharded
+data/hdf5*
+data/tfrecord*
 checkpoints/

+ 6 - 1
TensorFlow/LanguageModeling/BERT/.gitignore

@@ -9,7 +9,12 @@ __pycache__/
 *.so
 
 #Data
-data/*/*/
+data/download
+data/extracted
+data/formatted_one_article_per_line
+data/sharded
+data/hdf5*
+data/tfrecord*
 data/*/*.zip
 
 #Resutls

+ 8 - 5
TensorFlow/LanguageModeling/BERT/Dockerfile

@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.06-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.08-py3
 
 FROM tensorrtserver_client as trt
 
@@ -12,16 +12,19 @@ WORKDIR /workspace
 RUN git clone https://github.com/openai/gradient-checkpointing.git
 RUN git clone https://github.com/attardi/wikiextractor.git
 RUN git clone https://github.com/soskek/bookcorpus.git
+RUN git clone https://github.com/titipata/pubmed_parser
 
-# Copy the perf_client over
+RUN pip3 install /workspace/pubmed_parser
+
+#Copy the perf_client over
 COPY --from=trt /workspace/build/perf_client /workspace/build/perf_client
 
-# Copy the python wheel and install with pip
+#Copy the python wheel and install with pip
 COPY --from=trt /workspace/build/dist/dist/tensorrtserver*.whl /tmp/
 RUN pip install /tmp/tensorrtserver*.whl && rm /tmp/tensorrtserver*.whl
 
-
 WORKDIR /workspace/bert
 COPY . .
 
-ENV PYTHONPATH=/workspace/bert
+ENV PYTHONPATH /workspace/bert
+ENV BERT_PREP_WORKING_DIR /workspace/bert/data

Diff do ficheiro suprimidas por serem muito extensas
+ 409 - 183
TensorFlow/LanguageModeling/BERT/README.md


+ 218 - 0
TensorFlow/LanguageModeling/BERT/configurations.yml

@@ -0,0 +1,218 @@
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#1 DGX1 phase1
+bert--DGX1:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "1"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "7.5e-4"
+    NUM_ACCUMULATION_STEPS: "1024"
+    PHASE: "1"
+
+#4 DGX1 phase1
+bert--DGX1_n4:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "4"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "1.875e-4"
+    NUM_ACCUMULATION_STEPS: "256"
+    PHASE: "1"
+
+#16 DGX1 phase1
+bert--DGX1_n16:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "16"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "4.6875e-5"
+    NUM_ACCUMULATION_STEPS: "64"
+    PHASE: "1"
+
+#32 DGX1 phase1
+bert--DGX1_n32:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "32"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "2.34375e-5"
+    NUM_ACCUMULATION_STEPS: "32"
+    PHASE: "1"
+
+#1 DGX2 phase1
+bert--DGX2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "1"
+    BATCHSIZE: "32"
+    LEARNING_RATE: "3.75e-4"
+    NUM_ACCUMULATION_STEPS: "128"
+    PHASE: "1"
+
+#4 DGX2 phase1
+bert--DGX2_n4:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "4"
+    BATCHSIZE: "32"
+    LEARNING_RATE: "9.375e-5"
+    NUM_ACCUMULATION_STEPS: "32"
+    PHASE: "1"
+
+#16 DGX2 phase1
+bert--DGX2_n16:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "16"
+    BATCHSIZE: "256"
+    LEARNING_RATE: "3.75e-4"
+    NUM_ACCUMULATION_STEPS: "4"
+    PHASE: "1"
+
+#32 DGX2 phase1
+bert--DGX2_n32:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "32"
+    BATCHSIZE: "32"
+    LEARNING_RATE: "2.34375e-5"
+    NUM_ACCUMULATION_STEPS: "8"
+    PHASE: "1"
+
+#64 DGX2 phase1
+bert--DGX2_n64:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "64"
+    BATCHSIZE: "32"
+    LEARNING_RATE: "1.171875e-5"
+    NUM_ACCUMULATION_STEPS: "4"
+    PHASE: "1"
+
+#1 DGX1 phase2
+bert--DGX1_n1p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "1"
+    BATCHSIZE: "2"
+    LEARNING_RATE: "5e-4"
+    NUM_ACCUMULATION_STEPS: "4096"
+    PHASE: "2"
+
+#4 DGX1 phase2
+bert--DGX1_n4p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "4"
+    BATCHSIZE: "2"
+    LEARNING_RATE: "1.25e-4"
+    NUM_ACCUMULATION_STEPS: "512"
+    PHASE: "2"
+
+#16 DGX1 phase2
+bert--DGX1_n16p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "16"
+    BATCHSIZE: "2"
+    LEARNING_RATE: "1.5625e-5"
+    NUM_ACCUMULATION_STEPS: "128"
+    PHASE: "2"
+
+#32 DGX1 phase2
+bert--DGX1_n32p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "32"
+    BATCHSIZE: "2"
+    LEARNING_RATE: "1.5625e-5"
+    NUM_ACCUMULATION_STEPS: "64"
+    PHASE: "2"
+
+#1 DGX2 phase2
+bert--DGX2_n1p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "1"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "2.5e-5"
+    NUM_ACCUMULATION_STEPS: "256"
+    PHASE: "2"
+
+#4 DGX2 phase2
+bert--DGX2_n4p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "4"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "6.25e-5"
+    NUM_ACCUMULATION_STEPS: "64"
+    PHASE: "2"
+
+#16 DGX2 phase2
+bert--DGX2_n16p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "16"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "1.5625e-5"
+    NUM_ACCUMULATION_STEPS: "16"
+    PHASE: "2"
+
+#32 DGX2 phase2
+bert--DGX2_n32p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "32"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "7.8125e-6"
+    NUM_ACCUMULATION_STEPS: "8"
+    PHASE: "2"
+

+ 26 - 0
TensorFlow/LanguageModeling/BERT/data/BooksDownloader.py

@@ -0,0 +1,26 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+
+class BooksDownloader:
+    def __init__(self, save_path):
+        self.save_path = save_path
+        pass
+
+
+    def download(self):
+        bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out'
+        bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus'
+        bookscorpus_download_command += ' --trash-bad-count'
+        bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True)

+ 32 - 0
TensorFlow/LanguageModeling/BERT/data/BookscorpusTextFormatting.py

@@ -0,0 +1,32 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+
+class BookscorpusTextFormatting:
+    def __init__(self, books_path, output_filename, recursive = False):
+        self.books_path = books_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one book per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True):
+                with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
+                    for line in file:
+                        if line.strip() != '':
+                            ofile.write(line.strip() + ' ')
+                ofile.write("\n\n")

+ 120 - 0
TensorFlow/LanguageModeling/BERT/data/Downloader.py

@@ -0,0 +1,120 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
+from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
+from WikiDownloader import WikiDownloader
+from BooksDownloader import BooksDownloader
+from GLUEDownloader import GLUEDownloader
+from SquadDownloader import SquadDownloader
+from PubMedDownloader import PubMedDownloader
+
+class Downloader:
+    def __init__(self, dataset_name, save_path):
+        self.dataset_name = dataset_name
+        self.save_path = save_path
+
+
+    def download(self):
+        if self.dataset_name == 'bookscorpus':
+            self.download_bookscorpus()
+
+        elif self.dataset_name == 'wikicorpus_en':
+            self.download_wikicorpus('en')
+
+        elif self.dataset_name == 'wikicorpus_zh':
+            self.download_wikicorpus('zh')
+
+        elif self.dataset_name == 'pubmed_baseline':
+            self.download_pubmed('baseline')
+
+        elif self.dataset_name == 'pubmed_daily_update':
+            self.download_pubmed('daily_update')
+
+        elif self.dataset_name == 'pubmed_fulltext':
+            self.download_pubmed('fulltext')
+
+        elif self.dataset_name == 'pubmed_open_access':
+            self.download_pubmed('open_access')
+
+        elif self.dataset_name == 'google_pretrained_weights':
+            self.download_google_pretrained_weights()
+
+        elif self.dataset_name == 'nvidia_pretrained_weights':
+            self.download_nvidia_pretrained_weights()
+
+        elif self.dataset_name == 'MRPC':
+            self.download_glue(self.dataset_name)
+
+        elif self.dataset_name == 'MNLI':
+            self.download_glue(self.dataset_name)
+
+        elif self.dataset_name == 'CoLA':
+            self.download_glue(self.dataset_name)
+
+        elif self.dataset_name == 'squad':
+            self.download_squad()
+
+        elif self.dataset_name == 'all':
+            self.download_bookscorpus()
+            self.download_wikicorpus('en')
+            self.download_wikicorpus('zh')
+            self.download_pubmed('baseline')
+            self.download_pubmed('daily_update')
+            self.download_pubmed('fulltext')
+            self.download_pubmed('open_access')
+            self.download_google_pretrained_weights()
+            self.download_nvidia_pretrained_weights()
+            self.download_glue("CoLA")
+            self.download_glue("MNLI")
+            self.download_glue("MRPC")
+            self.download_squad()
+
+        else:
+            print(self.dataset_name)
+            assert False, 'Unknown dataset_name provided to downloader'
+
+
+    def download_bookscorpus(self):
+        downloader = BooksDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_wikicorpus(self, language):
+        downloader = WikiDownloader(language, self.save_path)
+        downloader.download()
+
+
+    def download_pubmed(self, subset):
+        downloader = PubMedDownloader(subset, self.save_path)
+        downloader.download()
+
+
+    def download_google_pretrained_weights(self):
+        downloader = GooglePretrainedWeightDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_nvidia_pretrained_weights(self):
+        downloader = NVIDIAPretrainedWeightDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_glue(self, glue_task_name):
+        downloader = GLUEDownloader(glue_task_name, self.save_path)
+        downloader.download()
+
+
+    def download_squad(self):
+        downloader = SquadDownloader(self.save_path)
+        downloader.download()

+ 109 - 0
TensorFlow/LanguageModeling/BERT/data/GLUEDownloader.py

@@ -0,0 +1,109 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib
+import sys
+import zipfile
+import io
+
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    URLLIB=urllib.request
+
+class GLUEDownloader:
+    def __init__(self, task, save_path):
+
+        # Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
+
+        self.TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
+                     "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
+                     "MRPC":{"mrpc_dev": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
+                            "mrpc_train": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt',
+                            "mrpc_test": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'},
+                     "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
+                     "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
+                     "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
+                     "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
+                     "QNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLI.zip?alt=media&token=c24cad61-f2df-4f04-9ab6-aa576fa829d0',
+                     "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
+                     "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
+                     "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
+
+
+        self.save_path = save_path
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        self.task = task
+
+    def download(self):
+
+        if self.task == 'MRPC':
+            self.download_mrpc()
+        elif self.task == 'diagnostic':
+            self.download_diagnostic()
+        else:
+            self.download_and_extract(self.task)
+
+    def download_and_extract(self, task):
+        print("Downloading and extracting %s..." % task)
+        data_file = "%s.zip" % task
+        URLLIB.urlretrieve(self.TASK2PATH[task], data_file)
+        print(data_file,"\n\n\n")
+        with zipfile.ZipFile(data_file) as zip_ref:
+            zip_ref.extractall(self.save_path)
+        os.remove(data_file)
+        print("\tCompleted!")
+
+    def download_mrpc(self):
+        print("Processing MRPC...")
+        mrpc_dir = os.path.join(self.save_path, "MRPC")
+        if not os.path.isdir(mrpc_dir):
+            os.mkdir(mrpc_dir)
+
+        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
+        mrpc_dev_file = os.path.join(mrpc_dir, "dev_ids.tsv")
+        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
+
+        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_train"], mrpc_train_file)
+        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_test"], mrpc_test_file)
+        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_dev"], mrpc_dev_file)
+
+        dev_ids = []
+        with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
+            for row in ids_fh:
+                dev_ids.append(row.strip().split('\t'))
+
+        with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
+                io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
+                io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
+            header = data_fh.readline()
+            train_fh.write(header)
+            dev_fh.write(header)
+            for row in data_fh:
+                label, id1, id2, s1, s2 = row.strip().split('\t')
+                if [id1, id2] in dev_ids:
+                    dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+                else:
+                    train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+
+        with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
+                io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
+            header = data_fh.readline()
+            test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
+            for idx, row in enumerate(data_fh):
+                label, id1, id2, s1, s2 = row.strip().split('\t')
+                test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
+        print("\tCompleted!")

+ 158 - 0
TensorFlow/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py

@@ -0,0 +1,158 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import hashlib
+import os
+import urllib.request
+import zipfile
+
class GooglePretrainedWeightDownloader:
    """Downloads Google's released BERT checkpoints and verifies them with SHA-256.

    Files are fetched into `<save_path>/google_pretrained_weights`, unzipped, and
    every extracted file is checked against a pinned checksum so silent upstream
    changes or corrupted downloads are detected.
    """

    def __init__(self, save_path):
        """Create the target directory and register download URLs and checksums.

        Args:
            save_path: Root directory under which `google_pretrained_weights` is created.
        """
        self.save_path = save_path + '/google_pretrained_weights'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        # Download urls: model key -> (url, local zip filename)
        self.model_urls = {
            'bert_base_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
            'bert_large_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
            'bert_base_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
            'bert_large_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
            'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
            'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
            'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
        }

        # SHA256sum verification for file download integrity (and checking for changes from the download source over time)
        self.bert_base_uncased_sha = {
            'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
            'bert_model.ckpt.data-00000-of-00001': '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
            'bert_model.ckpt.index': '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
            'bert_model.ckpt.meta': 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
            'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
        }

        self.bert_large_uncased_sha = {
            'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
            'bert_model.ckpt.data-00000-of-00001': 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
            'bert_model.ckpt.index': '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
            'bert_model.ckpt.meta': '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
            'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
        }

        self.bert_base_cased_sha = {
            'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
            'bert_model.ckpt.data-00000-of-00001': '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
            'bert_model.ckpt.index': '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
            'bert_model.ckpt.meta': '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
            'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
        }

        self.bert_large_cased_sha = {
            'bert_config.json': '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
            'bert_model.ckpt.data-00000-of-00001': '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
            'bert_model.ckpt.index': 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
            'bert_model.ckpt.meta': 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
            'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
        }

        self.bert_base_multilingual_cased_sha = {
            'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
            'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
            'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
            'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
            'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
        }

        self.bert_large_multilingual_uncased_sha = {
            'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
            'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
            'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
            'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
            'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
        }

        self.bert_base_chinese_sha = {
            'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
            'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
            'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
            'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
            'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
        }

        # Relate SHA to urls for loop below
        self.model_sha = {
            'bert_base_uncased': self.bert_base_uncased_sha,
            'bert_large_uncased': self.bert_large_uncased_sha,
            'bert_base_cased': self.bert_base_cased_sha,
            'bert_large_cased': self.bert_large_cased_sha,
            'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha,
            'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha,
            'bert_base_chinese': self.bert_base_chinese_sha
        }

    def sha256sum(self, filename):
        """Return the SHA-256 hex digest of `filename`, read in 128 KiB chunks."""
        digest = hashlib.sha256()
        buffer = bytearray(128 * 1024)
        view = memoryview(buffer)
        # buffering=0 + readinto avoids an extra copy per chunk.
        with open(filename, 'rb', buffering=0) as f:
            for n_read in iter(lambda: f.readinto(view), 0):
                digest.update(view[:n_read])

        return digest.hexdigest()

    def download(self):
        """Download each model archive, extract it, and verify all file checksums."""
        found_mismatch_sha = False
        for model, (url, zip_name) in self.model_urls.items():
            zip_path = self.save_path + '/' + zip_name

            print('Downloading', url)
            response = urllib.request.urlopen(url)
            with open(zip_path, 'wb') as handle:
                handle.write(response.read())

            print('Unzipping', zip_path)
            # Context manager guarantees the archive handle is closed even on error
            # (the original leaked it if extractall raised). Also avoids shadowing
            # the builtin `zip`.
            with zipfile.ZipFile(zip_path, 'r') as archive:
                archive.extractall(self.save_path)

            extracted_dir = zip_path[:-4]  # strip the trailing '.zip'
            sha_dict = self.model_sha[model]
            for extracted_file, expected_sha in sha_dict.items():
                if expected_sha != self.sha256sum(extracted_dir + '/' + extracted_file):
                    found_mismatch_sha = True
                    print('SHA256sum does not match on file:', extracted_file, 'from download url:', url)
                else:
                    print(extracted_dir + '/' + extracted_file, '\t', 'verified')

        if not found_mismatch_sha:
            print("All downloads pass sha256sum verification.")

    def serialize(self):
        """Placeholder for future serialization support."""
        pass

    def deserialize(self):
        """Placeholder for future deserialization support."""
        pass

    def listAvailableWeights(self):
        """Print the keys of every downloadable weight set."""
        print("Available Weight Datasets")
        for item in self.model_urls:
            print(item)

    def listLocallyStoredWeights(self):
        """Placeholder for future local-weights listing support."""
        pass
+

+ 27 - 0
TensorFlow/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py

@@ -0,0 +1,27 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
class NVIDIAPretrainedWeightDownloader:
    """Placeholder downloader for NVIDIA-pretrained BERT weights (not implemented yet)."""

    def __init__(self, save_path):
        """Create `<save_path>/nvidia_pretrained_weights` if it does not exist.

        Args:
            save_path: Root directory under which the weights directory is created.
        """
        self.save_path = save_path + '/nvidia_pretrained_weights'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def download(self):
        """Always raises: downloading NVIDIA weights is not implemented yet.

        Raises:
            NotImplementedError: unconditionally. A `raise` is used instead of the
                original `assert False` because asserts are stripped under
                `python -O`, which would silently turn this into a no-op.
        """
        raise NotImplementedError('NVIDIAPretrainedWeightDownloader not implemented yet.')

+ 93 - 0
TensorFlow/LanguageModeling/BERT/data/PubMedDownloader.py

@@ -0,0 +1,93 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import glob
+import gzip
+import os
+import urllib.request
+import shutil
+import sys
+
class PubMedDownloader:
    """Downloads a PubMed corpus subset over FTP and decompresses the archives."""

    def __init__(self, subset, save_path):
        """
        Args:
            subset: One of 'baseline', 'daily_update', 'fulltext', 'open_access'.
            save_path: Root directory; files are saved under `<save_path>/pubmed/<subset>`.
        """
        self.subset = subset
        # Modifying self.save_path in two steps to handle creation of subdirectories
        self.save_path = save_path + '/pubmed'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.save_path = self.save_path + '/' + subset

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.download_urls = {
            'baseline' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/',
            'daily_update' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/',
            'fulltext' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/',
            'open_access' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/'
        }

    def download(self):
        """Download all archives for the configured subset, then decompress the .xml.gz files."""
        print('subset:', self.subset)
        url = self.download_urls[self.subset]
        self.download_files(url)
        self.extract_files()

    def download_files(self, url):
        """Scrape the FTP directory listing and download every matching archive.

        Files that already exist locally are skipped, so the download is resumable.

        Raises:
            ValueError: if the configured subset is not recognized (a `raise` instead
                of the original `assert False`, which `python -O` would strip).
        """
        url = self.download_urls[self.subset]
        listing = os.popen('curl ' + url).read()

        # One selection predicate per subset family; the download loop is shared
        # (the original duplicated the loop body in both branches).
        if self.subset == 'fulltext' or self.subset == 'open_access':
            line_split = 'comm_use' if self.subset == 'fulltext' else 'non_comm_use'

            def wanted(line):
                return line[-10:] == 'xml.tar.gz' and \
                    line.split(' ')[-1].split('.')[0] == line_split
        elif self.subset == 'baseline' or self.subset == 'daily_update':
            def wanted(line):
                return line[-3:] == '.gz'
        else:
            raise ValueError('Invalid PubMed dataset/subset specified.')

        for line in listing.splitlines():
            if wanted(line):
                name = line.split(' ')[-1]
                file = os.path.join(self.save_path, name)
                if not os.path.isfile(file):
                    print('Downloading', file)
                    response = urllib.request.urlopen(url + name)
                    with open(file, "wb") as handle:
                        handle.write(response.read())

    def extract_files(self):
        """Decompress every downloaded .xml.gz file in place (the .gz is kept)."""
        for file in glob.glob(self.save_path + '/*.xml.gz'):
            print('file:', file)
            # Context managers close both handles even if read/write raises.
            with gzip.GzipFile(file, mode='rb') as compressed:
                data = compressed.read()

            with open(file[:-3], mode='wb') as out:
                out.write(data)

+ 44 - 0
TensorFlow/LanguageModeling/BERT/data/PubMedTextFormatting.py

@@ -0,0 +1,44 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+import pubmed_parser as pmp
+
class PubMedTextFormatting:
    """Merges PubMed MEDLINE XML files into one text file, one article per line."""

    def __init__(self, pubmed_path, output_filename, recursive=False):
        """
        Args:
            pubmed_path: Directory containing the *.xml MEDLINE files.
            output_filename: Path of the merged output text file.
            recursive: Whether to glob recursively under `pubmed_path`.
        """
        self.pubmed_path = pubmed_path
        self.recursive = recursive
        self.output_filename = output_filename

    # This puts one article per line
    def merge(self):
        """Parse every XML file and write each abstract as a single line.

        Very short lines (< 30 chars, e.g. headings or stray tokens) are dropped;
        articles are separated by a blank line.
        """
        print('PubMed path:', self.pubmed_path)

        with open(self.output_filename, mode='w', newline='\n') as ofile:
            for filename in glob.glob(self.pubmed_path + '/*.xml', recursive=self.recursive):
                print('file:', filename)
                dicts_out = pmp.parse_medline_xml(filename)
                for dict_out in dicts_out:
                    if not dict_out['abstract']:
                        continue
                    try:
                        for line in dict_out['abstract'].splitlines():
                            if len(line) < 30:
                                continue
                            ofile.write(line.strip() + " ")
                        ofile.write("\n\n")
                    except Exception:
                        # Narrowed from a bare `except:` so KeyboardInterrupt and
                        # SystemExit still propagate; malformed abstracts are
                        # skipped best-effort with an article separator.
                        ofile.write("\n\n")
                        continue

+ 54 - 0
TensorFlow/LanguageModeling/BERT/data/SquadDownloader.py

@@ -0,0 +1,54 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib.request
+import sys
+
class SquadDownloader:
    """Fetches the SQuAD v1.1 / v2.0 datasets plus their official evaluation scripts."""

    def __init__(self, save_path):
        """Create `<save_path>/squad` with v1.1/ and v2.0/ subdirectories."""
        self.save_path = save_path + '/squad'

        # Ensure the root and both per-version directories exist.
        for subdir in ('', '/v1.1', '/v2.0'):
            target = self.save_path + subdir
            if not os.path.exists(target):
                os.makedirs(target)

        # url -> relative destination path under self.save_path
        self.download_urls = {
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
            'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
            'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
        }

    def download(self):
        """Download every registered file, skipping ones already on disk."""
        for url, relative_path in self.download_urls.items():
            print('Downloading:', url)
            destination = self.save_path + '/' + relative_path
            if os.path.isfile(destination):
                print('** Download file already exists, skipping download')
                continue

            response = urllib.request.urlopen(url)
            with open(destination, "wb") as handle:
                handle.write(response.read())

+ 331 - 0
TensorFlow/LanguageModeling/BERT/data/TextSharding.py

@@ -0,0 +1,331 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from itertools import islice
+
+import multiprocessing
+import os
+import statistics
+
class Sharding:
    """Distributes a corpus of articles (one per line) into balanced training/test shards.

    Workflow: load articles from the input files, segment each into sentences,
    greedily pack articles into shards so each shard holds roughly the same number
    of sentences, then write one text file per shard (blank line between articles).
    """

    def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
        """
        Args:
            input_files: List of input text files, one article per line.
            output_name_prefix: Path prefix for the output shard filenames.
            n_training_shards: Number of training shards (> 0).
            n_test_shards: Number of test shards (> 0).
            fraction_test_set: Fraction of all sentences assigned to the test split.
        """
        assert len(input_files) > 0, 'The input file list must contain at least one file.'
        assert n_training_shards > 0, 'There must be at least one output shard.'
        assert n_test_shards > 0, 'There must be at least one output shard.'

        self.n_training_shards = n_training_shards
        self.n_test_shards = n_test_shards
        self.fraction_test_set = fraction_test_set

        self.input_files = input_files

        self.output_name_prefix = output_name_prefix
        self.output_training_identifier = '_training'
        self.output_test_identifier = '_test'
        self.output_file_extension = '.txt'

        self.articles = {}    # key: integer identifier, value: article text
        self.sentences = {}    # key: integer identifier, value: list of sentences
        self.output_training_files = {}    # key: filename, value: list of article ids to go into file
        self.output_test_files = {}  # key: filename, value: list of article ids to go into file

        self.init_output_files()

    # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
    def load_articles(self):
        """Read all input files into self.articles, skipping blank lines."""
        print('Start: Loading Articles')

        global_article_count = 0
        for input_file in self.input_files:
            print('input file:', input_file)
            with open(input_file, mode='r', newline='\n') as f:
                for i, line in enumerate(f):
                    if line.strip():
                        self.articles[global_article_count] = line.rstrip()
                        global_article_count += 1

        print('End: Loading Articles: There are', len(self.articles), 'articles.')

    def segment_articles_into_sentences(self, segmenter):
        """Split every loaded article into sentences via `segmenter.segment_string`."""
        print('Start: Sentence Segmentation')
        # `== 0` instead of the original `is 0`: identity comparison with int
        # literals relies on CPython small-int caching and is a SyntaxWarning
        # on modern Python.
        if len(self.articles) == 0:
            self.load_articles()

        assert len(self.articles) != 0, 'Please check that input files are present and contain data.'

        # TODO: WIP: multiprocessing (create independent ranges and spawn processes)
        use_multiprocessing = 'serial'

        def chunks(data, size=len(self.articles)):
            it = iter(data)
            for i in range(0, len(data), size):
                yield {k: data[k] for k in islice(it, size)}

        if use_multiprocessing == 'manager':
            manager = multiprocessing.Manager()
            return_dict = manager.dict()
            jobs = []
            n_processes = 7    # in addition to the main process, total = n_proc+1

            def work(articles, return_dict):
                sentences = {}
                for i, article in enumerate(articles):
                    sentences[i] = segmenter.segment_string(articles[article])

                    if i % 5000 == 0:
                        print('Segmenting article', i)

                return_dict.update(sentences)

            for item in chunks(self.articles, len(self.articles)):
                p = multiprocessing.Process(target=work, args=(item, return_dict))

                # Busy wait
                while len(jobs) >= n_processes:
                    pass

                jobs.append(p)
                p.start()

            for proc in jobs:
                proc.join()

        elif use_multiprocessing == 'queue':
            work_queue = multiprocessing.Queue()
            jobs = []

            for item in chunks(self.articles, len(self.articles)):
                pass

        else:    # serial option
            for i, article in enumerate(self.articles):
                self.sentences[i] = segmenter.segment_string(self.articles[article])

                if i % 5000 == 0:
                    print('Segmenting article', i)

        print('End: Sentence Segmentation')

    def init_output_files(self):
        """Populate the output filename -> article-id-list dicts (constructor use only)."""
        print('Start: Init Output Files')
        assert len(self.output_training_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
        assert len(self.output_test_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'

        for i in range(self.n_training_shards):
            name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
            self.output_training_files[name] = []

        for i in range(self.n_test_shards):
            name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
            self.output_test_files[name] = []

        print('End: Init Output Files')

    def get_sentences_per_shard(self, shard):
        """Return the total sentence count over all article ids in `shard`."""
        result = 0
        for article_id in shard:
            result += len(self.sentences[article_id])

        return result

    def distribute_articles_over_shards(self):
        """Greedily assign articles to shards, balancing sentence counts per shard.

        First pass seeds every shard with one of the largest remaining articles;
        subsequent passes fill each shard up to its nominal size, bumping the
        nominal size whenever no progress is made for a few rounds.
        """
        print('Start: Distribute Articles Over Shards')
        assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.'

        # Create dictionary with - key: sentence count per article, value: article id number
        sentence_counts = defaultdict(lambda: [])

        max_sentences = 0
        total_sentences = 0

        for article_id in self.sentences:
            current_length = len(self.sentences[article_id])
            sentence_counts[current_length].append(article_id)
            max_sentences = max(max_sentences, current_length)
            total_sentences += current_length

        n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences)
        nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards
        nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards

        consumed_article_set = set({})
        unused_article_set = set(self.articles.keys())

        # Make first pass and add one article worth of lines per file
        for file in self.output_training_files:
            current_article_id = sentence_counts[max_sentences][-1]
            sentence_counts[max_sentences].pop(-1)
            self.output_training_files[file].append(current_article_id)
            consumed_article_set.add(current_article_id)
            unused_article_set.remove(current_article_id)

            # Maintain the max sentence count
            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                max_sentences -= 1

            if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard:
                nominal_sentences_per_training_shard = len(self.sentences[current_article_id])
                print('Warning: A single article contains more than the nominal number of sentences per training shard.')

        for file in self.output_test_files:
            current_article_id = sentence_counts[max_sentences][-1]
            sentence_counts[max_sentences].pop(-1)
            self.output_test_files[file].append(current_article_id)
            consumed_article_set.add(current_article_id)
            unused_article_set.remove(current_article_id)

            # Maintain the max sentence count
            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                max_sentences -= 1

            if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard:
                nominal_sentences_per_test_shard = len(self.sentences[current_article_id])
                print('Warning: A single article contains more than the nominal number of sentences per test shard.')

        training_counts = []
        test_counts = []

        for shard in self.output_training_files:
            training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))

        for shard in self.output_test_files:
            test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))

        training_median = statistics.median(training_counts)
        test_median = statistics.median(test_counts)

        # Make subsequent passes over files to find articles to add without going over limit
        history_remaining = []
        n_history_remaining = 4

        while len(consumed_article_set) < len(self.articles):
            for fidx, file in enumerate(self.output_training_files):
                nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences)

                # Maintain the max sentence count
                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                    max_sentences -= 1

                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
                    nominal_next_article_size -= 1

                # `== 0` instead of `is 0` (identity on int literals is unreliable).
                if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or training_counts[fidx] > training_median:
                    continue    # skip adding to this file, will come back later if no file can accept unused articles

                current_article_id = sentence_counts[nominal_next_article_size][-1]
                sentence_counts[nominal_next_article_size].pop(-1)

                self.output_training_files[file].append(current_article_id)
                consumed_article_set.add(current_article_id)
                unused_article_set.remove(current_article_id)

            for fidx, file in enumerate(self.output_test_files):
                nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences)

                # Maintain the max sentence count
                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                    max_sentences -= 1

                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
                    nominal_next_article_size -= 1

                if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or test_counts[fidx] > test_median:
                    continue    # skip adding to this file, will come back later if no file can accept unused articles

                current_article_id = sentence_counts[nominal_next_article_size][-1]
                sentence_counts[nominal_next_article_size].pop(-1)

                self.output_test_files[file].append(current_article_id)
                consumed_article_set.add(current_article_id)
                unused_article_set.remove(current_article_id)

            # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed
            if len(history_remaining) == n_history_remaining:
                history_remaining.pop(0)
            history_remaining.append(len(unused_article_set))

            history_same = True
            for i in range(1, len(history_remaining)):
                history_same = history_same and (history_remaining[i-1] == history_remaining[i])

            if history_same:
                nominal_sentences_per_training_shard += 1
                # nominal_sentences_per_test_shard += 1

            training_counts = []
            test_counts = []
            for shard in self.output_training_files:
                training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))

            for shard in self.output_test_files:
                test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))

            training_median = statistics.median(training_counts)
            test_median = statistics.median(test_counts)

            print('Distributing data over shards:', len(unused_article_set), 'articles remaining.')

        if len(unused_article_set) != 0:
            print('Warning: Some articles did not make it into output files.')

        for shard in self.output_training_files:
            print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard]))

        for shard in self.output_test_files:
            print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard]))

        print('End: Distribute Articles Over Shards')

    def write_shards_to_disk(self):
        """Write every training and test shard to its output file."""
        print('Start: Write Shards to Disk')
        for shard in self.output_training_files:
            self.write_single_shard(shard, self.output_training_files[shard], 'training')

        for shard in self.output_test_files:
            self.write_single_shard(shard, self.output_test_files[shard], 'test')

        print('End: Write Shards to Disk')

    def write_single_shard(self, shard_name, shard, split):
        """Write one shard into the `<dir>/<split>/<basename>` file, one sentence per line."""
        shard_split = os.path.split(shard_name)
        shard_name = shard_split[0] + '/' + split + '/' + shard_split[1]

        with open(shard_name, mode='w', newline='\n') as f:
            for article_id in shard:
                for line in self.sentences[article_id]:
                    f.write(line + '\n')

                f.write('\n')  # Line break between articles
+
+
import nltk

# Fetch the Punkt tokenizer models before any segmentation happens.
nltk.download('punkt')

class NLTKSegmenter:
    """Thin wrapper around NLTK's Punkt sentence tokenizer."""

    def __init__(self):
        # Fixed constructor name: the original defined `__init` (missing trailing
        # underscores), which Python never invokes as a constructor.
        pass

    def segment_string(self, article):
        """Return `article` split into a list of sentence strings."""
        return nltk.tokenize.sent_tokenize(article)
+

+ 58 - 0
TensorFlow/LanguageModeling/BERT/data/WikiDownloader.py

@@ -0,0 +1,58 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib.request
+import sys
+import subprocess
+
+class WikiDownloader:
+    def __init__(self, language, save_path):
+        self.save_path = save_path + '/wikicorpus_' + language
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        self.language = language
+        self.download_urls = {
+            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+        }
+
+        self.output_files = {
+            'en' : 'wikicorpus_en.xml.bz2',
+            'zh' : 'wikicorpus_zh.xml.bz2'
+        }
+
+
+    def download(self):
+        if self.language in self.download_urls:
+            url = self.download_urls[self.language]
+            filename = self.output_files[self.language]
+
+            print('Downloading:', url)
+            if os.path.isfile(self.save_path + '/' + filename):
+                print('** Download file already exists, skipping download')
+            else:
+                response = urllib.request.urlopen(url)
+                with open(self.save_path + '/' + filename, "wb") as handle:
+                    handle.write(response.read())
+
+            # Always unzipping since this is relatively fast and will overwrite
+            print('Unzipping:', self.output_files[self.language])
+            subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
+
+        else:
+            assert False, 'WikiDownloader not implemented for this language yet.'
+

+ 46 - 0
TensorFlow/LanguageModeling/BERT/data/WikicorpusTextFormatting.py

@@ -0,0 +1,46 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+
+class WikicorpusTextFormatting:
+    def __init__(self, wiki_path, output_filename, recursive = False):
+        self.wiki_path = wiki_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one article per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
+                for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
+                    print(filename)
+                    article_lines = []
+                    article_open = False
+
+                    with open(filename, mode='r', newline='\n') as file:
+                        for line in file:
+                            if '<doc id=' in line:
+                                article_open = True
+                            elif '</doc>' in line:
+                                article_open = False
+                                for oline in article_lines[1:]:
+                                    if oline != '\n':
+                                        ofile.write(oline.rstrip() + " ")
+                                ofile.write("\n\n")
+                                article_lines = []
+                            else:
+                                if article_open:
+                                    article_lines.append(line)

+ 12 - 0
TensorFlow/LanguageModeling/BERT/data/__init__.py

@@ -0,0 +1,12 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 389 - 0
TensorFlow/LanguageModeling/BERT/data/bertPrep.py

@@ -0,0 +1,389 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import BookscorpusTextFormatting
+import Downloader
+import TextSharding
+import WikicorpusTextFormatting
+import PubMedTextFormatting
+
+import argparse
+import itertools
+import multiprocessing
+import os
+import pprint
+import subprocess
+
+
+def main(args):
+    working_dir = os.environ['BERT_PREP_WORKING_DIR']
+
+    print('Working Directory:', working_dir)
+    print('Action:', args.action)
+    print('Dataset Name:', args.dataset)
+
+    if args.input_files:
+        args.input_files = args.input_files.split(',')
+
+    hdf5_tfrecord_folder_prefix = "/lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \
+                                  + "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \
+                                  + "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor) \
+                                  + "_shard_" + str(args.n_training_shards) + "_test_split_" + str(int(args.fraction_test_set * 100))
+    directory_structure = {
+        'download' : working_dir + '/download',    # Downloaded and decompressed
+        'extracted' : working_dir +'/extracted',    # Extracted from whatever the initial format is (e.g., wikiextractor)
+        'formatted' : working_dir + '/formatted_one_article_per_line',    # This is the level where all sources should look the same
+        'sharded' : working_dir + '/sharded',
+        'tfrecord' : working_dir + '/tfrecord' + hdf5_tfrecord_folder_prefix,
+        'hdf5': working_dir + '/hdf5'+ hdf5_tfrecord_folder_prefix,
+    }
+
+    print('\nDirectory Structure:')
+    pp = pprint.PrettyPrinter(indent=2)
+    pp.pprint(directory_structure)
+    print('')
+
+    if args.action == 'download':
+        if not os.path.exists(directory_structure['download']):
+            os.makedirs(directory_structure['download'])
+
+        downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
+        downloader.download()
+
+    elif args.action == 'text_formatting':
+        assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' \
+               and args.dataset != 'squad' and args.dataset != 'MRPC' and args.dataset != 'CoLA' and \
+               args.dataset != 'MNLI', 'Cannot perform text_formatting on pretrained weights'
+
+        if not os.path.exists(directory_structure['extracted']):
+            os.makedirs(directory_structure['extracted'])
+
+        if not os.path.exists(directory_structure['formatted']):
+            os.makedirs(directory_structure['formatted'])
+
+        if args.dataset == 'bookscorpus':
+            books_path = directory_structure['download'] + '/bookscorpus'
+            #books_path = directory_structure['download']
+            output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'
+            books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True)
+            books_formatter.merge()
+
+        elif args.dataset == 'wikicorpus_en':
+            if args.skip_wikiextractor == 0:
+                path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
+                wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
+                print('WikiExtractor Command:', wikiextractor_command)
+                wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
+
+            wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
+            output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
+            wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
+            wiki_formatter.merge()
+
+        elif args.dataset == 'wikicorpus_zh':
+            assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.'
+            if args.skip_wikiextractor == 0:
+                path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
+                wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
+                print('WikiExtractor Command:', wikiextractor_command)
+                wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
+
+            wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
+            output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
+            wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
+            wiki_formatter.merge()
+
+        elif args.dataset == 'pubmed_baseline':
+            pubmed_path = directory_structure['download'] + '/pubmed' + '/baseline'
+            output_filename = directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt'
+            pubmed_formatter = PubMedTextFormatting.PubMedTextFormatting(pubmed_path, output_filename, recursive=True)
+            pubmed_formatter.merge()
+
+    elif args.action == 'sharding':
+        # Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces)
+        if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset or 'pubmed' in args.dataset:
+            if args.input_files is None:
+                if args.dataset == 'bookscorpus':
+                    args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt']
+                elif args.dataset == 'wikicorpus_en':
+                    args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
+                elif args.dataset == 'wikicorpus_zh':
+                    args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt']
+                elif args.dataset == 'books_wiki_en_corpus':
+                    args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
+                elif args.dataset == 'pubmed_baseline':
+                    args.input_files = [directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt']
+
+            output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset
+
+            if not os.path.exists(directory_structure['sharded']):
+                os.makedirs(directory_structure['sharded'])
+
+            if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset):
+                os.makedirs(directory_structure['sharded'] + '/' + args.dataset)
+                
+            if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/training'):
+                os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/training')
+                
+            if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/test'):
+                os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/test')
+
+            # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
+            # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
+            # Different languages (e.g., Chinese simplified/traditional) may require translation and
+            # other packages to be called from here -- just add a conditional branch for those extra steps
+            segmenter = TextSharding.NLTKSegmenter()
+            sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)
+
+            sharding.load_articles()
+            sharding.segment_articles_into_sentences(segmenter)
+            sharding.distribute_articles_over_shards()
+            sharding.write_shards_to_disk()
+
+        else:
+            assert False, 'Unsupported dataset for sharding'
+
+    elif args.action == 'create_tfrecord_files':
+        if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset):
+            os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset)
+        
+        if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/training'):
+            os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/training')
+            
+        if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/test'):
+            os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/test')
+
+        last_process = None
+
+        def create_record_worker(filename_prefix, shard_id, output_format='tfrecord', split='training'):
+            bert_preprocessing_command = 'python /workspace/bert/utils/create_pretraining_data.py'
+            bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
+            bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
+            bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
+            bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
+            bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
+            bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
+            bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
+            bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
+            bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
+            bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
+            bert_preprocessing_process.communicate()
+
+            last_process = bert_preprocessing_process
+
+            # This could be better optimized (fine if all take equal time)
+            if shard_id % args.n_processes == 0 and shard_id > 0:
+                bert_preprocessing_process.wait()
+
+            return last_process
+
+        output_file_prefix = args.dataset
+
+        for i in range(args.n_training_shards):
+            last_process = create_record_worker(output_file_prefix + '_training', i, 'tfrecord', 'training')
+
+        last_process.wait()
+
+        for i in range(args.n_test_shards):
+            last_process = create_record_worker(output_file_prefix + '_test', i, 'tfrecord', 'test')
+
+        last_process.wait()
+
+
+    elif args.action == 'create_hdf5_files':
+        assert False, 'HDF5 format not fully supported in this release.'
+
+        if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset):
+            os.makedirs(directory_structure['hdf5'] + "/" + args.dataset)
+
+        last_process = None
+
+        def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
+            bert_preprocessing_command = 'python /workspace/bert/utils/create_pretraining_data.py'
+            bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
+            bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
+            bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
+            bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
+            bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
+            bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
+            bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
+            bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
+            bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
+            bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
+            bert_preprocessing_process.communicate()
+
+            last_process = bert_preprocessing_process
+
+            # This could be better optimized (fine if all take equal time)
+            if shard_id % args.n_processes == 0 and shard_id > 0:
+                bert_preprocessing_process.wait()
+
+            return last_process
+
+        for i in range(args.n_training_shards):
+            last_process = create_record_worker(args.dataset + '_training', i)
+
+        last_process.wait()
+
+        for i in range(args.n_test_shards):
+            last_process = create_record_worker(args.dataset + '_test', i)
+
+        last_process.wait()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Preprocessing Application for Everything BERT-related'
+    )
+
+    parser.add_argument(
+        '--action',
+        type=str,
+        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords',
+        choices={
+            'download',                   # Download and verify md5/sha sums
+            'text_formatting',            # Convert into a file that contains one article/book per line
+            'sharding',                   # Convert previous formatted text into shards containing one sentence per line
+            'create_tfrecord_files',      # Turn each shard into a TFrecord with masking and next sentence prediction info
+            'create_hdf5_files'           # Turn each shard into a HDF5 file with masking and next sentence prediction info
+        }
+    )
+
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        help='Specify the dataset to perform --action on',
+        choices={
+            'bookscorpus',
+            'wikicorpus_en',
+            'wikicorpus_zh',
+            'books_wiki_en_corpus',
+            'pubmed_baseline',
+            'pubmed_daily_update',
+            'pubmed_fulltext',
+            'pubmed_open_access',
+            'google_pretrained_weights',
+            'nvidia_pretrained_weights',
+            'squad',
+            'MRPC',
+            'CoLA',
+            'MNLI',
+            'all'
+        }
+    )
+
+    parser.add_argument(
+        '--input_files',
+        type=str,
+        help='Specify the input files in a comma-separated list (no spaces)'
+    )
+
+    parser.add_argument(
+        '--n_training_shards',
+        type=int,
+        help='Specify the number of training shards to generate',
+        default=256
+    )
+
+    parser.add_argument(
+        '--n_test_shards',
+        type=int,
+        help='Specify the number of test shards to generate',
+        default=256
+    )
+
+    parser.add_argument(
+        '--fraction_test_set',
+        type=float,
+        help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)',
+        default=0.2
+    )
+
+    parser.add_argument(
+        '--segmentation_method',
+        type=str,
+        help='Specify your choice of sentence segmentation',
+        choices={
+            'nltk'
+        },
+        default='nltk'
+    )
+
+    parser.add_argument(
+        '--n_processes',
+        type=int,
+        help='Specify the max number of processes to allow at one time',
+        default=4
+    )
+
+    parser.add_argument(
+        '--random_seed',
+        type=int,
+        help='Specify the base seed to use for any random number generation',
+        default=12345
+    )
+
+    parser.add_argument(
+        '--dupe_factor',
+        type=int,
+        help='Specify the duplication factor',
+        default=5
+    )
+
+    parser.add_argument(
+        '--masked_lm_prob',
+        type=float,
+        help='Specify the probability for masked lm',
+        default=0.15
+    )
+
+    parser.add_argument(
+        '--max_seq_length',
+        type=int,
+        help='Specify the maximum sequence length',
+        default=512
+    )
+
+    parser.add_argument(
+        '--max_predictions_per_seq',
+        type=int,
+        help='Specify the maximum number of masked words per sequence',
+        default=20
+    )
+
+    parser.add_argument(
+        '--do_lower_case',
+        type=int,
+        help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)',
+        default=1
+    )
+
+    parser.add_argument(
+        '--vocab_file',
+        type=str,
+        help='Specify absolute path to vocab file to use'
+    )
+
+    parser.add_argument(
+        '--skip_wikiextractor',
+        type=int,
+        help='Specify whether to skip wikiextractor step 0=False, 1=True',
+        default=0
+    )
+
+    parser.add_argument(
+        '--interactive_json_config_generator',
+        type=str,
+        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords'
+    )
+
+    args = parser.parse_args()
+    main(args)

+ 0 - 15
TensorFlow/LanguageModeling/BERT/data/bookcorpus/clean_and_merge_text.py

@@ -1,15 +0,0 @@
-# NVIDIA
-
-import glob
-import os
-
-output_file = os.environ['WORKING_DIR'] + '/intermediate_files/bookcorpus.txt'
-download_path = os.environ['WORKING_DIR'] + '/download/'
-
-with open(output_file, "w") as ofile:
-  for filename in glob.glob(download_path + '*.txt', recursive=True):
-    with open(filename, mode='r', encoding="utf-8-sig") as file:
-      for line in file:
-        if line.strip() != "":
-          ofile.write(line.strip() + " ")
-    ofile.write("\n\n ")

+ 0 - 27
TensorFlow/LanguageModeling/BERT/data/bookcorpus/config.sh

@@ -1,27 +0,0 @@
-#! /bin/bash
-
-set -e
-
-USE_BERT_LARGE=true
-MAX_SEQUENCE_LENGTH=512
-MAX_PREDICTIONS_PER_SEQUENCE=80
-MASKED_LM_PROB=0.15
-SEED=12345
-DUPE_FACTOR=5
-DO_LOWER_CASE="True"
-N_LINES_PER_SHARD_APPROX=396000   # Default=396000 creates 256 shards
-
-N_PROCS_PREPROCESS=4    # Adjust this based on memory requirements and available number of cores
-export WORKING_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-BERT_BASE_DIR="${WORKING_DIR}/../pretrained_models_google/uncased_L-12_H-768_A-12"
-BERT_LARGE_DIR="${WORKING_DIR}/../pretrained_models_google/uncased_L-24_H-1024_A-16"
-
-if [ "$USE_BERT_LARGE" = true ] ; then
-  VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
-else
-  VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
-fi
-
-OUTPUT_DIR="${WORKING_DIR}/final_tfrecords_sharded/bert_large_bookcorpus_seq_${MAX_SEQUENCE_LENGTH}_pred_${MAX_PREDICTIONS_PER_SEQUENCE}"
-

+ 0 - 18
TensorFlow/LanguageModeling/BERT/data/bookcorpus/create_pseudo_test_set.py

@@ -1,18 +0,0 @@
-# NVIDIA
-
-import glob
-import os
-import random
-import shutil
-
-input_dir = os.environ['WORKING_DIR'] + '/final_text_files_sharded/'
-output_dir = os.environ['WORKING_DIR'] + '/test_set_text_files/'
-
-random.seed(13254)
-n_shards_to_keep = 3
-
-file_glob = glob.glob(input_dir + '/*', recursive=False)
-file_glob = random.sample(file_glob, n_shards_to_keep)
-
-for filename in file_glob:
-  shutil.copy(filename, output_dir) 

+ 0 - 10
TensorFlow/LanguageModeling/BERT/data/bookcorpus/create_pseudo_test_set.sh

@@ -1,10 +0,0 @@
-#! /bin/bash
-
-source /workspace/bert/data/bookcorpus/config.sh
-
-# Convert test set sharded text files into tfrecords that are ready for BERT pretraining
-echo "Creating test set tfrecords for each text shard"
-mkdir -p ${WORKING_DIR}/test_set_text_files
-mkdir -p ${WORKING_DIR}/test_set_tfrecords
-python3 ${WORKING_DIR}/create_pseudo_test_set.py
-. ${WORKING_DIR}/preprocessing_test_set_xargs_wrapper.sh ${N_PROCS_PREPROCESS}

+ 0 - 23
TensorFlow/LanguageModeling/BERT/data/bookcorpus/preprocessing.sh

@@ -1,23 +0,0 @@
-#! /bin/bash
-
-SHARD_INDEX=${1}
-INPUT_FILE="${WORKING_DIR}/final_text_files_sharded/bookcorpus.segmented.part.${SHARD_INDEX}.txt"
-
-source /workspace/bert/data/bookcorpus/config.sh
-
-OUTPUT_DIR=${WORKING_DIR}/final_tfrecords_sharded
-mkdir -p ${OUTPUT_DIR}
-
-OUTPUT_FILE="${OUTPUT_DIR}/tf_examples.tfrecord000${SHARD_INDEX}"
-
-python /workspace/bert/utils/create_pretraining_data.py \
-  --input_file=${INPUT_FILE} \
-  --output_file=${OUTPUT_FILE} \
-  --vocab_file=${VOCAB_FILE} \
-  --do_lower_case=${DO_LOWER_CASE} \
-  --max_seq_length=${MAX_SEQUENCE_LENGTH} \
-  --max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
-  --masked_lm_prob=${MASKED_LM_PROB} \
-  --random_seed=${SEED} \
-  --dupe_factor=${DUPE_FACTOR}
-

+ 0 - 28
TensorFlow/LanguageModeling/BERT/data/bookcorpus/preprocessing_test_set.sh

@@ -1,28 +0,0 @@
-#! /bin/bash
-
-INPUT_FILE=${1}
-
-source /workspace/bert/data/bookcorpus/config.sh
-
-OUTPUT_DIR=${WORKING_DIR}/test_set_tfrecords
-mkdir -p ${OUTPUT_DIR}
-
-#SHARD_INDEX=$(( echo ${INPUT_FILE} | egrep -o [0-9]+ ))
-SHARD_INDEX=$( eval echo ${INPUT_FILE} | sed -e s/[^0-9]//g )
-OUTPUT_FILE="${OUTPUT_DIR}/tf_examples.tfrecord000${SHARD_INDEX}"
-
-SEED=13254
-
-echo "Shard index ${SHARD_INDEX}"
-
-python /workspace/bert/utils/create_pretraining_data.py \
-  --input_file=${INPUT_FILE} \
-  --output_file=${OUTPUT_FILE} \
-  --vocab_file=${VOCAB_FILE} \
-  --do_lower_case=${DO_LOWER_CASE} \
-  --max_seq_length=${MAX_SEQUENCE_LENGTH} \
-  --max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
-  --masked_lm_prob=${MASKED_LM_PROB} \
-  --random_seed=${SEED} \
-  --dupe_factor=${DUPE_FACTOR}
-

+ 0 - 12
TensorFlow/LanguageModeling/BERT/data/bookcorpus/preprocessing_test_set_xargs_wrapper.sh

@@ -1,12 +0,0 @@
-#! /bin/bash
-
-source /workspace/bert/data/bookcorpus/config.sh
-
-SHARD_COUNT=0
-rm -rf /workspace/bert/data/bookcorpus/xarg_list.txt
-touch /workspace/bert/data/bookcorpus/xarg_list.txt
-for file in /workspace/bert/data/bookcorpus/test_set_text_files/*; do
-  echo ${file} >> /workspace/bert/data/bookcorpus/xarg_list.txt
-done
-
-xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=/workspace/bert/data/bookcorpus/xarg_list.txt /workspace/bert/data/bookcorpus/preprocessing_test_set.sh

+ 0 - 13
TensorFlow/LanguageModeling/BERT/data/bookcorpus/preprocessing_xargs_wrapper.sh

@@ -1,13 +0,0 @@
-#! /bin/bash
-
-source /workspace/bert/data/bookcorpus/config.sh
-
-SHARD_COUNT=0
-rm -rf /workspace/bert/data/bookcorpus/xarg_list.txt
-touch /workspace/bert/data/bookcorpus/xarg_list.txt
-for file in /workspace/bert/data/bookcorpus/final_text_files_sharded/*; do
-  echo ${SHARD_COUNT} >> /workspace/bert/data/bookcorpus/xarg_list.txt
-  SHARD_COUNT=$((SHARD_COUNT+1))
-done
-
-xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=/workspace/bert/data/bookcorpus/xarg_list.txt /workspace/bert/data/bookcorpus/preprocessing.sh

+ 0 - 28
TensorFlow/LanguageModeling/BERT/data/bookcorpus/run_preprocessing.sh

@@ -1,28 +0,0 @@
-#! /bin/bash
-
-source /workspace/bert/data/bookcorpus/config.sh
-
-# Download books
-mkdir -p download
-python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ${WORKING_DIR}/download --trash-bad-count
-
-# Clean and prep (one book per line)
-mkdir -p ${WORKING_DIR}/intermediate_files
-python3 ${WORKING_DIR}/clean_and_merge_text.py
-
-# Split books into one-sentence-per-line format for use with BERT scripts
-echo "Applying sentence segmentation to get one sentence per line"
-mkdir -p ${WORKING_DIR}/final_text_file_single
-python3 ${WORKING_DIR}/sentence_segmentation_nltk.py
-# Note: NLTK can be replaced with Spacy, although it is slower (2 variations provided)
-
-# Shard finalized text so that it has a chance of fitting in memory when creating pretraining data into tfrecords (choose appropriate number of shards for distributed training)
-echo "Shard text files - size is approximate to prevent splitting a book across shards"
-mkdir -p ${WORKING_DIR}/final_text_files_sharded
-python3 ${WORKING_DIR}/shard_text_input_file.py
-
-# Convert sharded text files into tfrecords that are ready for BERT pretraining
-echo "Creating tfrecords for each text shard"
-mkdir -p ${WORKING_DIR}/final_tfrecords_sharded
-. ${WORKING_DIR}/preprocessing_xargs_wrapper.sh ${N_PROCS_PREPROCESS}
-

+ 0 - 20
TensorFlow/LanguageModeling/BERT/data/bookcorpus/sentence_segmentation_nltk.py

@@ -1,20 +0,0 @@
-# NVIDIA
-
-import nltk
-import os
-
-nltk.download('punkt')
-
-input_file = os.environ['WORKING_DIR'] + '/intermediate_files/bookcorpus.txt'
-output_file = os.environ['WORKING_DIR'] + '/final_text_file_single/bookcorpus.segmented.nltk.txt'
-
-doc_seperator = "\n"
-
-with open(input_file) as ifile:
-  with open(output_file, "w") as ofile:
-    for line in ifile:
-      if line != "\n":
-        sent_list = nltk.tokenize.sent_tokenize(line)
-        for sent in sent_list:
-          ofile.write(sent + "\n")
-        ofile.write(doc_seperator)

+ 0 - 41
TensorFlow/LanguageModeling/BERT/data/bookcorpus/shard_text_input_file.py

@@ -1,41 +0,0 @@
-# NVIDIA
-
-import os
-
-input_file = os.environ['WORKING_DIR'] + '/final_text_file_single/bookcorpus.segmented.nltk.txt'
-output_file = os.environ['WORKING_DIR'] + '/final_text_files_sharded/bookcorpus.segmented.part.'
-
-doc_seperator = "\n"
-
-line_buffer = []
-shard_size = 396000 # Approximate, will split at next article break
-line_counter = 0
-shard_index = 0
-
-ifile_lines = 0
-with open(input_file) as ifile:
-  for line in ifile:
-    ifile_lines += 1
-
-print("Input file contains", ifile_lines, "lines.")
-
-iline_counter = 1
-with open(input_file) as ifile:
-  for line in ifile:
-    if line_counter < shard_size and iline_counter < ifile_lines:
-      line_buffer.append(line)
-      line_counter += 1
-      iline_counter += 1
-    elif line_counter >= shard_size and line != "\n" and iline_counter < ifile_lines:
-      line_buffer.append(line)
-      line_counter += 1
-      iline_counter += 1
-    else:
-       with open(output_file + str(shard_index) + ".txt", "w") as ofile:
-         for oline in line_buffer:
-           ofile.write(oline)
-         line_buffer = []
-         line_counter = 0
-         shard_index += 1
-
-    

+ 46 - 0
TensorFlow/LanguageModeling/BERT/data/create_datasets_from_start.sh

@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export BERT_PREP_WORKING_DIR="${BERT_PREP_WORKING_DIR}"
+
+# Download
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset bookscorpus
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset wikicorpus_en
+
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset google_pretrained_weights  # Includes vocab
+
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset squad
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset "CoLA"
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset "MRPC"
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset "MNLI"
+
+
+# Properly format the text files
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action text_formatting --dataset bookscorpus
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action text_formatting --dataset wikicorpus_en
+
+
+# Shard the text files (group wiki+books then shard)
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action sharding --dataset books_wiki_en_corpus
+
+
+# Create TFRecord files Phase 1
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset books_wiki_en_corpus --max_seq_length 128 \
+ --max_predictions_per_seq 20 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt
+
+
+# Create TFRecord files Phase 2
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset books_wiki_en_corpus --max_seq_length 512 \
+ --max_predictions_per_seq 80 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt

+ 0 - 153
TensorFlow/LanguageModeling/BERT/data/glue/download_glue_data.py

@@ -1,153 +0,0 @@
-#
-#
-#  @unpublished{wang2018glue
-#      title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for
-#              Natural Language Understanding}
-#      author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill,
-#              Felix and Levy, Omer and Bowman, Samuel R.}
-#      note={arXiv preprint 1804.07461}
-#      year={2018}
-#  }
-#
-#  Script for downloading all GLUE data.
-# Note: for legal reasons, we are unable to host MRPC.
-# You can either use the version hosted by the SentEval team, which is already tokenized,
-# or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
-# For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
-# You should then rename and place specific files in a folder (see below for an example).
-# mkdir MRPC
-# cabextract MSRParaphraseCorpus.msi -d MRPC
-# cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
-# cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
-# rm MRPC/_*
-# rm MSRParaphraseCorpus.msi
-
-
-import os
-import sys
-import shutil
-import argparse
-import tempfile
-import urllib
-import io
-if sys.version_info >= (3, 0):
-    import urllib.request
-import zipfile
-
-URLLIB=urllib
-if sys.version_info >= (3, 0):
-    URLLIB=urllib.request
-
-TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
-TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
-             "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
-             "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
-             "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
-             "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
-             "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
-             "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
-             "QNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLI.zip?alt=media&token=c24cad61-f2df-4f04-9ab6-aa576fa829d0',
-             "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
-             "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
-             "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
-
-MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
-MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
-
-def download_and_extract(task, data_dir):
-    print("Downloading and extracting %s..." % task)
-    data_file = "%s.zip" % task
-    URLLIB.urlretrieve(TASK2PATH[task], data_file)
-    with zipfile.ZipFile(data_file) as zip_ref:
-        zip_ref.extractall(data_dir)
-    os.remove(data_file)
-    print("\tCompleted!")
-
-def format_mrpc(data_dir, path_to_data):
-    print("Processing MRPC...")
-    mrpc_dir = os.path.join(data_dir, "MRPC")
-    if not os.path.isdir(mrpc_dir):
-        os.mkdir(mrpc_dir)
-    if path_to_data:
-        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
-        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
-    else:
-        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
-        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
-        URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file)
-        URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file)
-    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
-    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
-    URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
-
-    dev_ids = []
-    with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
-        for row in ids_fh:
-            dev_ids.append(row.strip().split('\t'))
-
-    with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
-         io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
-         io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
-        header = data_fh.readline()
-        train_fh.write(header)
-        dev_fh.write(header)
-        for row in data_fh:
-            label, id1, id2, s1, s2 = row.strip().split('\t')
-            if [id1, id2] in dev_ids:
-                dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
-            else:
-                train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
-
-    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
-            io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
-        header = data_fh.readline()
-        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
-        for idx, row in enumerate(data_fh):
-            label, id1, id2, s1, s2 = row.strip().split('\t')
-            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
-    print("\tCompleted!")
-
-def download_diagnostic(data_dir):
-    print("Downloading and extracting diagnostic...")
-    if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
-        os.mkdir(os.path.join(data_dir, "diagnostic"))
-    data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
-    URLLIB.urlretrieve(TASK2PATH["diagnostic"], data_file)
-    print("\tCompleted!")
-    return
-
-def get_tasks(task_names):
-    task_names = task_names.split(',')
-    if "all" in task_names:
-        tasks = TASKS
-    else:
-        tasks = []
-        for task_name in task_names:
-            assert task_name in TASKS, "Task %s not found!" % task_name
-            tasks.append(task_name)
-    return tasks
-
-def main(arguments):
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-d', '--data_dir', help='directory to save data to', type=str, default='.')
-    parser.add_argument('-t', '--tasks', help='tasks to download data for as a comma separated string',
-                        type=str, default='all')
-    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt',
-                        type=str, default='')
-    args = parser.parse_args(arguments)
-
-    if not os.path.isdir(args.data_dir):
-        os.mkdir(args.data_dir)
-    tasks = get_tasks(args.tasks)
-
-    for task in tasks:
-        if task == 'MRPC':
-            format_mrpc(args.data_dir, args.path_to_mrpc)
-        elif task == 'diagnostic':
-            download_diagnostic(args.data_dir)
-        else:
-            download_and_extract(task, args.data_dir)
-
-
-if __name__ == '__main__':
-    sys.exit(main(sys.argv[1:]))

+ 0 - 123
TensorFlow/LanguageModeling/BERT/data/pretrained_models_google/download_models.py

@@ -1,123 +0,0 @@
-# NVIDIA
-
-import hashlib
-import urllib.request
-import zipfile
-
-# Download urls
-model_urls = {
-  'bert_base_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
-  'bert_large_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
-  'bert_base_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
-  'bert_large_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
-  'bert_base_multilingual_cased' : ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
-  'bert_large_multilingual_uncased' : ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
-  'bert_base_chinese' : ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
-}
-
-# SHA256sum verification for file download integrity (and checking for changes from the download source over time)
-bert_base_uncased_sha = {
-  'bert_config.json' : '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
-  'bert_model.ckpt.data-00000-of-00001' : '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
-  'bert_model.ckpt.index' : '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
-  'bert_model.ckpt.meta' : 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
-  'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
-}
-
-bert_large_uncased_sha = {
-  'bert_config.json' : 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
-  'bert_model.ckpt.data-00000-of-00001' : 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
-  'bert_model.ckpt.index' : '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
-  'bert_model.ckpt.meta' : '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
-  'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
-}
-
-bert_base_cased_sha = {
-  'bert_config.json' : 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
-  'bert_model.ckpt.data-00000-of-00001' : '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
-  'bert_model.ckpt.index' : '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
-  'bert_model.ckpt.meta' : '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
-  'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
-}
-
-bert_large_cased_sha = {
-  'bert_config.json' : '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
-  'bert_model.ckpt.data-00000-of-00001' : '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
-  'bert_model.ckpt.index' : 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
-  'bert_model.ckpt.meta' : 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
-  'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
-}
-
-bert_base_multilingual_cased_sha = {
-  'bert_config.json' : 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
-  'bert_model.ckpt.data-00000-of-00001' : '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
-  'bert_model.ckpt.index' : '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
-  'bert_model.ckpt.meta' : '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
-  'vocab.txt' : 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
-}
-
-bert_large_multilingual_uncased_sha = {
-  'bert_config.json' : '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
-  'bert_model.ckpt.data-00000-of-00001' : '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
-  'bert_model.ckpt.index' : '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
-  'bert_model.ckpt.meta' : '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
-  'vocab.txt' : '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
-}
-
-bert_base_chinese_sha = {
-  'bert_config.json' : '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
-  'bert_model.ckpt.data-00000-of-00001' : '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
-  'bert_model.ckpt.index' : '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
-  'bert_model.ckpt.meta' : 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
-  'vocab.txt' : '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
-}
-
-# Relate SHA to urls for loop below
-model_sha = {
-  'bert_base_uncased' : bert_base_uncased_sha,
-  'bert_large_uncased' : bert_large_uncased_sha,
-  'bert_base_cased' : bert_base_cased_sha,
-  'bert_large_cased' : bert_large_cased_sha,
-  'bert_base_multilingual_cased' : bert_base_multilingual_cased_sha,
-  'bert_large_multilingual_uncased' : bert_large_multilingual_uncased_sha,
-  'bert_base_chinese' : bert_base_chinese_sha
-}
-
-# Helper to get sha256sum of a file
-def sha256sum(filename):
-  h  = hashlib.sha256()
-  b  = bytearray(128*1024)
-  mv = memoryview(b)
-  with open(filename, 'rb', buffering=0) as f:
-    for n in iter(lambda : f.readinto(mv), 0):
-      h.update(mv[:n])
-  return h.hexdigest()
-
-# Iterate over urls: download, unzip, verify sha256sum
-found_mismatch_sha = False
-for model in model_urls:
-  url = model_urls[model][0]
-  file = model_urls[model][1]
-
-  print("Downloading", url)
-  response = urllib.request.urlopen(url)
-  with open(file, "wb") as handle:
-    handle.write(response.read())
-
-  print("Unzipping", file)
-  zip = zipfile.ZipFile(file, 'r')
-  zip.extractall()
-  zip.close()
-
-  sha_dict = model_sha[model]
-  for extracted_file in sha_dict:
-    sha = sha_dict[extracted_file]
-    if sha != sha256sum(file[:-4] + "/" + extracted_file):
-      found_mismatch_sha = True
-      print("SHA256sum does not match on file:", extracted_file, "from download url:", url)
-    else:
-      print(file[:-4] + "/" + extracted_file, "\t", "verified")
-
-if not found_mismatch_sha:
-  print("All downloads pass sha256sum verification.")
-

+ 0 - 60
TensorFlow/LanguageModeling/BERT/data/squad/squad_download.sh

@@ -1,60 +0,0 @@
-#!/usr/bin/env bash
-
-echo "Downloading dataset for squad..."
-
-# Download SQuAD
-
-v1="v1.1"
-mkdir $v1
-wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $v1/train-v1.1.json
-wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $v1/dev-v1.1.json
-wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O $v1/evaluate-v1.1.py
-
-EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945  -'
-EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552  -'
-EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9  -'
-CALC_TRAIN_v1=`cat ${v1}/train-v1.1.json |md5sum`
-CALC_DEV_v1=`cat ${v1}/dev-v1.1.json |md5sum`
-CALC_EVAL_v1=`cat ${v1}/evaluate-v1.1.py |md5sum`
-
-v2="v2.0"
-mkdir $v2
-wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O $v2/train-v2.0.json
-wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O $v2/dev-v2.0.json
-wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O $v2/evaluate-v2.0.py
-
-EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740  -'
-EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8  -'
-EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627  -'
-
-CALC_TRAIN_v2=`cat ${v2}/train-v2.0.json |md5sum`
-CALC_DEV_v2=`cat ${v2}/dev-v2.0.json |md5sum`
-CALC_EVAL_v2=`cat ${v2}/evaluate-v2.0.py |md5sum`
-
-echo "Squad data download done!"
-
-echo "Verifying Dataset...."
-
-if [ "$EXP_TRAIN_v1" != "$CALC_TRAIN_v1" ]; then
-    echo "train-v1.1.json is corrupted! md5sum doesn't match"
-fi
-
-if [ "$EXP_DEV_v1" != "$CALC_DEV_v1" ]; then
-    echo "dev-v1.1.json is corrupted! md5sum doesn't match"
-fi
-if [ "$EXP_EVAL_v1" != "$CALC_EVAL_v1" ]; then
-    echo "evaluate-v1.1.py is corrupted! md5sum doesn't match"
-fi
-
-
-if [ "$EXP_TRAIN_v2" != "$CALC_TRAIN_v2" ]; then
-    echo "train-v2.0.json is corrupted! md5sum doesn't match"
-fi
-if [ "$EXP_DEV_v2" != "$CALC_DEV_v2" ]; then
-    echo "dev-v2.0.json is corrupted! md5sum doesn't match"
-fi
-if [ "$EXP_EVAL_v2" != "$CALC_EVAL_v2" ]; then
-    echo "evaluate-v2.0.py is corrupted! md5sum doesn't match"
-fi
-
-echo "SQuAD download complete!"

+ 0 - 28
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/config.sh

@@ -1,28 +0,0 @@
-#! /bin/bash
-
-set -e
-
-USE_BERT_LARGE=true
-MAX_SEQUENCE_LENGTH=512
-MAX_PREDICTIONS_PER_SEQUENCE=80
-MASKED_LM_PROB=0.15
-SEED=12345
-DUPE_FACTOR=5
-DO_LOWER_CASE="True"
-N_LINES_PER_SHARD_APPROX=396000   # Default=396000 creates 256 shards
-
-N_PROCS_PREPROCESS=4    # Adjust this based on memory requirements and available number of cores
-export WORKING_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-WIKI_DUMP="ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20190301/enwiki-20190301-pages-articles-multistream.xml.bz2"
-BERT_BASE_DIR="${WORKING_DIR}/../pretrained_models_google/uncased_L-12_H-768_A-12"
-BERT_LARGE_DIR="${WORKING_DIR}/../pretrained_models_google/uncased_L-24_H-1024_A-16"
-
-if [ "$USE_BERT_LARGE" = true ] ; then
-  VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
-else
-  VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
-fi
-
-OUTPUT_DIR="${WORKING_DIR}/final_tfrecords_sharded/bert_large_wikipedia_seq_${MAX_SEQUENCE_LENGTH}_pred_${MAX_PREDICTIONS_PER_SEQUENCE}"
-

+ 0 - 18
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/create_pseudo_test_set.py

@@ -1,18 +0,0 @@
-# NVIDIA
-
-import glob
-import os
-import random
-import shutil
-
-input_dir = os.environ['WORKING_DIR'] + '/final_text_files_sharded/'
-output_dir = os.environ['WORKING_DIR'] + '/test_set_text_files/'
-
-random.seed(13254)
-n_shards_to_keep = 3
-
-file_glob = glob.glob(input_dir + '/*', recursive=False)
-file_glob = random.sample(file_glob, n_shards_to_keep)
-
-for filename in file_glob:
-  shutil.copy(filename, output_dir) 

+ 0 - 10
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/create_pseudo_test_set.sh

@@ -1,10 +0,0 @@
-#! /bin/bash
-
-source /workspace/bert/data/wikipedia_corpus/config.sh
-
-# Convert test set sharded text files into tfrecords that are ready for BERT pretraining
-echo "Creating test set tfrecords for each text shard"
-mkdir -p ${WORKING_DIR}/test_set_text_files
-mkdir -p ${WORKING_DIR}/test_set_tfrecords
-python3 ${WORKING_DIR}/create_pseudo_test_set.py
-. ${WORKING_DIR}/preprocessing_test_set_xargs_wrapper.sh ${N_PROCS_PREPROCESS}

+ 0 - 23
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/preprocessing.sh

@@ -1,23 +0,0 @@
-#! /bin/bash
-
-SHARD_INDEX=${1}
-INPUT_FILE="${WORKING_DIR}/final_text_files_sharded/wikipedia.segmented.part.${SHARD_INDEX}.txt"
-
-source /workspace/bert/data/wikipedia_corpus/config.sh
-
-OUTPUT_DIR=${WORKING_DIR}/final_tfrecords_sharded
-mkdir -p ${OUTPUT_DIR}
-
-OUTPUT_FILE="${OUTPUT_DIR}/tf_examples.tfrecord000${SHARD_INDEX}"
-
-python /workspace/bert/utils/create_pretraining_data.py \
-  --input_file=${INPUT_FILE} \
-  --output_file=${OUTPUT_FILE} \
-  --vocab_file=${VOCAB_FILE} \
-  --do_lower_case=${DO_LOWER_CASE} \
-  --max_seq_length=${MAX_SEQUENCE_LENGTH} \
-  --max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
-  --masked_lm_prob=${MASKED_LM_PROB} \
-  --random_seed=${SEED} \
-  --dupe_factor=${DUPE_FACTOR}
-

+ 0 - 28
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/preprocessing_test_set.sh

@@ -1,28 +0,0 @@
-#! /bin/bash
-
-INPUT_FILE=${1}
-
-source /workspace/bert/data/wikipedia_corpus/config.sh
-
-OUTPUT_DIR=${WORKING_DIR}/test_set_tfrecords
-mkdir -p ${OUTPUT_DIR}
-
-#SHARD_INDEX=$(( echo ${INPUT_FILE} | egrep -o [0-9]+ ))
-SHARD_INDEX=$( eval echo ${INPUT_FILE} | sed -e s/[^0-9]//g )
-OUTPUT_FILE="${OUTPUT_DIR}/tf_examples.tfrecord000${SHARD_INDEX}"
-
-SEED=13254
-
-echo "Shard index ${SHARD_INDEX}"
-
-python /workspace/bert/utils/create_pretraining_data.py \
-  --input_file=${INPUT_FILE} \
-  --output_file=${OUTPUT_FILE} \
-  --vocab_file=${VOCAB_FILE} \
-  --do_lower_case=${DO_LOWER_CASE} \
-  --max_seq_length=${MAX_SEQUENCE_LENGTH} \
-  --max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
-  --masked_lm_prob=${MASKED_LM_PROB} \
-  --random_seed=${SEED} \
-  --dupe_factor=${DUPE_FACTOR}
-

+ 0 - 12
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/preprocessing_test_set_xargs_wrapper.sh

@@ -1,12 +0,0 @@
-#! /bin/bash
-
-source /workspace/bert/data/wikipedia_corpus/config.sh
-
-SHARD_COUNT=0
-rm -rf /workspace/bert/data/wikipedia_corpus/xarg_list.txt
-touch /workspace/bert/data/wikipedia_corpus/xarg_list.txt
-for file in /workspace/bert/data/wikipedia_corpus/test_set_text_files/*; do
-  echo ${file} >> /workspace/bert/data/wikipedia_corpus/xarg_list.txt
-done
-
-xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=/workspace/bert/data/wikipedia_corpus/xarg_list.txt /workspace/bert/data/wikipedia_corpus/preprocessing_test_set.sh

+ 0 - 13
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/preprocessing_xargs_wrapper.sh

@@ -1,13 +0,0 @@
-#! /bin/bash
-
-source /workspace/bert/data/wikipedia_corpus/config.sh
-
-SHARD_COUNT=0
-rm -rf /workspace/bert/data/wikipedia_corpus/xarg_list.txt
-touch /workspace/bert/data/wikipedia_corpus/xarg_list.txt
-for file in /workspace/bert/data/wikipedia_corpus/final_text_files_sharded/*; do
-  echo ${SHARD_COUNT} >> /workspace/bert/data/wikipedia_corpus/xarg_list.txt
-  SHARD_COUNT=$((SHARD_COUNT+1))
-done
-
-xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=/workspace/bert/data/wikipedia_corpus/xarg_list.txt /workspace/bert/data/wikipedia_corpus/preprocessing.sh

+ 0 - 30
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/remove_tags_and_clean.py

@@ -1,30 +0,0 @@
-# NVIDIA
-
-import glob
-import os
-
-output_file = os.environ['WORKING_DIR'] + '/intermediate_files/wikipedia.txt'
-
-with open(output_file, "w") as ofile:
-  for dirname in glob.glob('extracted_articles/*/', recursive=False):
-    for filename in glob.glob(dirname + 'wiki_*', recursive=True):
-      print(filename)
-      article_lines = []
-      article_open = False
-      
-      with open(filename, "r") as file:
-        for line in file:
-          if "<doc id=" in line:
-            article_open = True
-          elif "</doc>" in line:
-            article_open = False
-            for oline in article_lines[1:]:
-              if oline != "\n":
-                ofile.write(oline.rstrip() + " ")
-            ofile.write("\n\n")
-            article_lines = []
-          else:
-            if article_open:
-              article_lines.append(line)
-            
-

+ 0 - 49
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/run_preprocessing.sh

@@ -1,49 +0,0 @@
-#! /bin/bash
-
-source /workspace/bert/data/wikipedia_corpus/config.sh
-
-# Note: There are several directories created to make it clear what has been performed at each stage of preprocessing. The intermediate files may be useful if you want to further clean/prepare/augment the data for your own applications.
-# NLTK was chosen as the default over spaCy simply due to speed of sentence segmentation on the large files.
-
-# Download Wikipedia dump file
-mkdir -p ${WORKING_DIR}/download
-
-# Not using --noclobber since it emits an error if exists (incompatible with bash 'set -e')
-echo "Downloading Wikidump"
-if [ ! -f ${WORKING_DIR}/download/wikidump.xml.bz2 ]; then
-  cd ${WORKING_DIR}/download && wget -O wikidump.xml.bz2 ${WIKI_DUMP}
-fi
-
-# Extract dump
-echo "Extracting Wikidump"
-mkdir -p ${WORKING_DIR}/raw_data
-#cd ${WORKING_DIR}/raw_data && pv ${WORKING_DIR}/download/wikidump.xml.bz2 | pbzip2 -kdc > ${WORKING_DIR}/raw_data/wikidump.xml
-cd ${WORKING_DIR}/raw_data && pv ${WORKING_DIR}/download/wikidump.xml.bz2 | bunzip2 -kdc > ${WORKING_DIR}/raw_data/wikidump.xml
-#cd ${WORKING_DIR}/raw_data && bunzip2 -kdc ${WORKING_DIR}/download/wikidump.xml.bz2 > ${WORKING_DIR}/raw_data/wikidump.xml
- 
-# Wikiextractor.py - Creates lots of folders/files in "doc format"
-echo "Running Wikiextractor"
-mkdir -p ${WORKING_DIR}/extracted_articles
-/workspace/wikiextractor/WikiExtractor.py ${WORKING_DIR}/raw_data/wikidump.xml -b 1000M --processes ${N_PROCS_PREPROCESS} -o ${WORKING_DIR}/extracted_articles
-
-# Remove XML Tags and extraneous titles (since they are not sentences)
-# Also clean to remove lines between paragraphs within article and use space-separated articles
-echo "Cleaning and formatting files (one article per line)"
-mkdir -p ${WORKING_DIR}/intermediate_files
-python3 ${WORKING_DIR}/remove_tags_and_clean.py
-
-# Split articles into one-sentence-per-line format for use with BERT scripts
-echo "Applying sentence segmentation to get one sentence per line"
-mkdir -p ${WORKING_DIR}/final_text_file_single
-python3 ${WORKING_DIR}/wiki_sentence_segmentation_nltk.py
-# Note: NLTK can be replaced with Spacy, although it is slower (2 variations provided)
-
-# Shard finalized text so that it has a chance of fitting in memory when creating pretraining data into tfrecords (choose appropriate number of shards for distributed training)
-echo "Shard text files - size is approximate to prevent splitting an article across shards"
-mkdir -p ${WORKING_DIR}/final_text_files_sharded
-python3 ${WORKING_DIR}/shard_text_input_file.py
-
-# Convert sharded text files into tfrecords that are ready for BERT pretraining
-echo "Creating tfrecords for each text shard"
-mkdir -p ${WORKING_DIR}/final_tfrecords_sharded
-. ${WORKING_DIR}/preprocessing_xargs_wrapper.sh ${N_PROCS_PREPROCESS}

+ 0 - 39
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/shard_text_input_file.py

@@ -1,39 +0,0 @@
-# NVIDIA
-
-import os
-
-input_file = os.environ['WORKING_DIR'] + '/final_text_file_single/wikipedia.segmented.nltk.txt'
-output_file = os.environ['WORKING_DIR'] + '/final_text_files_sharded/wikipedia.segmented.part.'
-
-doc_seperator = "\n"
-
-line_buffer = []
-shard_size = 396000 # Approximate, will split at next article break
-line_counter = 0
-shard_index = 0
-
-ifile_lines = 0
-with open(input_file) as ifile:
-  for line in ifile:
-    ifile_lines += 1
-
-print("Input file contains", ifile_lines, "lines.")
-
-iline_counter = 1
-with open(input_file) as ifile:
-  for line in ifile:
-    if line_counter < shard_size and iline_counter < ifile_lines:
-      line_buffer.append(line)
-      line_counter += 1
-      iline_counter += 1
-    elif line_counter >= shard_size and line != "\n" and iline_counter < ifile_lines:
-      line_buffer.append(line)
-      line_counter += 1
-      iline_counter += 1
-    else:
-       with open(output_file + str(shard_index) + ".txt", "w") as ofile:
-         for oline in line_buffer:
-           ofile.write(oline)
-         line_buffer = []
-         line_counter = 0
-         shard_index += 1

+ 0 - 20
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/wiki_sentence_segmentation_nltk.py

@@ -1,20 +0,0 @@
-# NVIDIA
-
-import nltk
-import os
-
-nltk.download('punkt')
-
-input_file = os.environ['WORKING_DIR'] + '/intermediate_files/wikipedia.txt'
-output_file = os.environ['WORKING_DIR'] + '/final_text_file_single/wikipedia.segmented.nltk.txt'
-
-doc_seperator = "\n"
-
-with open(input_file) as ifile:
-  with open(output_file, "w") as ofile:
-    for line in ifile:
-      if line != "\n":
-        sent_list = nltk.tokenize.sent_tokenize(line)
-        for sent in sent_list:
-          ofile.write(sent + "\n")
-        ofile.write(doc_seperator)

+ 0 - 22
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/wiki_sentence_segmentation_spacy.py

@@ -1,22 +0,0 @@
-# NVIDIA
-
-import os
-import spacy
-
-#spacy.prefer_gpu()
-spacy.require_gpu()
-
-input_file = os.environ['WORKING_DIR'] + '/intermediate_files/wikipedia.txt'
-output_file = os.environ['WORKING_DIR'] + '/final_test_file_single/wikipedia.segmented.txt'
-
-nlp = spacy.load('en_core_web_sm')
-
-doc_seperator = "\n"
-
-with open(input_file) as ifile:
-  with open(output_file, "w") as ofile:
-    for line in ifile:
-      if line != "\n":
-        doc = nlp(line)
-        for sent in doc.sents:
-          ofile.write(sent.text + "\n")

+ 0 - 33
TensorFlow/LanguageModeling/BERT/data/wikipedia_corpus/wiki_sentence_segmentation_spacy_pipe.py

@@ -1,33 +0,0 @@
-# NVIDIA
-
-import os
-import spacy
-
-#spacy.prefer_gpu()
-spacy.require_gpu()
-
-input_file = os.environ['WORKING_DIR'] + '/intermediate_files/wikipedia.txt'
-output_file = os.environ['WORKING_DIR'] + '/final_test_file_single/wikipedia.segmented.txt'
-
-nlp = spacy.load('en_core_web_sm')
-
-doc_seperator = "\n"
-
-file_mem = []
-
-print("Reading file into memory.")
-with open(input_file) as ifile:
-  for line in ifile:
-    if line != "\n":
-      file_mem.append(line)
-
-print("File read.")
-print("Starting nlp.pipe")
-docs = nlp.pipe(file_mem, batch_size=1000)
-
-print("Starting to write output")
-with open(output_file, "w") as ofile:
-  for item in docs:
-    for sent in item.sents:
-      if sent.text != "\n":
-        ofile.write(sent.text + "\n")

+ 244 - 43
TensorFlow/LanguageModeling/BERT/optimization.py

@@ -1,4 +1,5 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Functions and classes related to optimization (weight updates)."""
 
 from __future__ import absolute_import
@@ -20,14 +22,25 @@ from __future__ import print_function
 
 import re
 import tensorflow as tf
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from horovod.tensorflow.compression import Compression
 
-
-def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False):
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False, num_accumulation_steps=1,
+                     optimizer_type="adam", allreduce_post_accumulation=False):
   """Creates an optimizer training op."""
   global_step = tf.train.get_or_create_global_step()
-
+  
   # avoid step change in learning rate at end of warmup phase
-  decayed_learning_rate_at_crossover_point = init_lr * (1.0-float(num_warmup_steps)/float(num_train_steps))
+  if optimizer_type == "adam":
+      power = 1.0
+      decayed_learning_rate_at_crossover_point = init_lr * (
+                  (1.0 - float(num_warmup_steps) / float(num_train_steps)) ** power)
+  else:
+      power = 0.5
+      decayed_learning_rate_at_crossover_point = init_lr
+
   adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
   print('decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (decayed_learning_rate_at_crossover_point, adjusted_init_lr))
 
@@ -39,7 +52,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None,
       global_step,
       num_train_steps,
       end_learning_rate=0.0,
-      power=1.0,
+      power=power,
       cycle=False)
 
   # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
@@ -58,49 +71,120 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None,
     learning_rate = (
         (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
 
-  # It is recommended that you use this optimizer for fine tuning, since this
-  # is how the model was trained (note that the Adam m/v variables are NOT
-  # loaded from init_checkpoint.)
-  optimizer = AdamWeightDecayOptimizer(
-      learning_rate=learning_rate,
-      weight_decay_rate=0.01,
-      beta_1=0.9,
-      beta_2=0.999,
-      epsilon=1e-6,
-      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
-
-  if hvd is not None:
-    from horovod.tensorflow.compression import Compression
-    optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none)
+  if optimizer_type == "lamb":
+      print("Initializing LAMB Optimizer")
+      optimizer = LAMBOptimizer(
+          learning_rate=learning_rate,
+          weight_decay_rate=0.01,
+          beta_1=0.9,
+          beta_2=0.999,
+          epsilon=1e-6,
+          exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+  else:
+      print("Initializing ADAM Weight Decay Optimizer")
+      # It is recommended that you use this optimizer for fine tuning, since this
+      # is how the model was trained (note that the Adam m/v variables are NOT
+      # loaded from init_checkpoint.)
+      optimizer = AdamWeightDecayOptimizer(
+          learning_rate=learning_rate,
+          weight_decay_rate=0.01,
+          beta_1=0.9,
+          beta_2=0.999,
+          epsilon=1e-6,
+          exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if hvd is not None and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
+    optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)
   if manual_fp16 or use_fp16:
     loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
     optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager)
 
   tvars = tf.trainable_variables()
-  grads_and_vars = optimizer.compute_gradients(loss, tvars)
-  grads_and_vars = [(g,v) for g,v in grads_and_vars if g is not None]
-  grads, tvars = list(zip(*grads_and_vars))
-  all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if manual_fp16 or use_fp16 else tf.constant(True, dtype=tf.bool)
-
-  # This is how the model was pre-trained.
-  # ensure global norm is a finite number 
-  # to prevent clip_by_global_norm from having a hizzy fit.
-  (clipped_grads, _) = tf.clip_by_global_norm(
-        grads, clip_norm=1.0, 
-        use_norm=tf.cond(
-            all_are_finite,
-            lambda: tf.global_norm(grads),
-            lambda: tf.constant(1.0)))
-
-  train_op = optimizer.apply_gradients(
-      list(zip(clipped_grads, tvars)), global_step=global_step)
-
-  # Normally the global step update is done inside of `apply_gradients`.
-  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
-  # a different optimizer, you should probably take this line out.
-  new_global_step = tf.cond(all_are_finite, lambda: global_step+1, lambda: global_step)
-  new_global_step = tf.identity(new_global_step, name='step_update')
-  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  grads_and_vars = optimizer.compute_gradients(loss * 1.0 / num_accumulation_steps, tvars)
+
+  if num_accumulation_steps > 1:
+      local_step = tf.get_variable(name="local_step", shape=[], dtype=tf.int32, trainable=False,
+                                   initializer=tf.zeros_initializer)
+      batch_finite = tf.get_variable(name="batch_finite", shape=[], dtype=tf.bool, trainable=False,
+                                     initializer=tf.ones_initializer)
+      accum_vars = [tf.get_variable(
+          name=tvar.name.split(":")[0] + "/accum",
+          shape=tvar.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer()) for tvar in tf.trainable_variables()]
+
+      reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool)
+      local_step = tf.cond(reset_step, lambda:local_step.assign(tf.ones_like(local_step)), lambda:local_step.assign_add(1))
+
+      grads_and_vars_and_accums = [(gv[0],gv[1],accum_vars[i]) for i, gv in enumerate(grads_and_vars) if gv[0] is not None]
+      grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))
+
+      all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if manual_fp16 or use_fp16 else tf.constant(True, dtype=tf.bool)
+      batch_finite = tf.cond(reset_step,
+        lambda: batch_finite.assign(tf.math.logical_and(tf.constant(True, dtype=tf.bool), all_are_finite)),
+        lambda:batch_finite.assign(tf.math.logical_and(batch_finite, all_are_finite)))
+
+      # This is how the model was pre-trained.
+      # ensure global norm is a finite number
+      # to prevent clip_by_global_norm from having a hissy fit.
+      (clipped_grads, _) = tf.clip_by_global_norm(
+            grads, clip_norm=1.0,
+            use_norm=tf.cond(
+                all_are_finite,
+                lambda: tf.global_norm(grads),
+                lambda: tf.constant(1.0)))
+
+      accum_vars = tf.cond(reset_step,
+              lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(clipped_grads)],
+              lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(clipped_grads)])
+
+      def update(accum_vars):
+          if allreduce_post_accumulation and hvd is not None:
+              accum_vars = [hvd.allreduce(tf.convert_to_tensor(accum_var), compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) if isinstance(accum_var, tf.IndexedSlices)
+                            else hvd.allreduce(accum_var, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) for accum_var in accum_vars]
+          return optimizer.apply_gradients(list(zip(accum_vars, tvars)), global_step=global_step)
+
+      update_step = tf.identity(tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool), name="update_step")
+      update_op = tf.cond(update_step,
+                          lambda: update(accum_vars), lambda: tf.no_op())
+
+      # Normally the global step update is done inside of `apply_gradients`.
+      # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+      # a different optimizer, you should probably take this line out.
+      # new_global_step = tf.identity(tf.cond(tf.math.logical_and(update_step, batch_finite), lambda: global_step.assign_add(1), lambda: global_step.assign(global_step)), name='step_update')
+      # train_op = tf.group(update_op, new_global_step)
+      new_global_step = tf.cond(tf.math.logical_and(update_step, batch_finite), lambda: global_step+1, lambda: global_step)
+      new_global_step = tf.identity(new_global_step, name='step_update')
+      train_op = tf.group(update_op, [global_step.assign(new_global_step)])
+  else:
+      grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
+      grads, tvars = list(zip(*grads_and_vars))
+      all_are_finite = tf.reduce_all(
+          [tf.reduce_all(tf.is_finite(g)) for g in grads]) if use_fp16 or manual_fp16 else tf.constant(True, dtype=tf.bool)
+
+      # This is how the model was pre-trained.
+      # ensure global norm is a finite number
+      # to prevent clip_by_global_norm from having a hissy fit.
+      (clipped_grads, _) = tf.clip_by_global_norm(
+          grads, clip_norm=1.0,
+          use_norm=tf.cond(
+              all_are_finite,
+              lambda: tf.global_norm(grads),
+              lambda: tf.constant(1.0)))
+
+      train_op = optimizer.apply_gradients(
+          list(zip(clipped_grads, tvars)), global_step=global_step)
+
+      # Normally the global step update is done inside of `apply_gradients`.
+      # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+      # a different optimizer, you should probably take this line out.
+      new_global_step = tf.cond(all_are_finite, lambda: global_step + 1, lambda: global_step)
+      new_global_step = tf.identity(new_global_step, name='step_update')
+      train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+
+      # new_global_step = tf.identity(tf.cond(all_are_finite, lambda: global_step.assign_add(1), lambda: global_step.assign(global_step)), name='step_update')
+      # train_op = tf.group(update_op, new_global_step)
   return train_op
 
 
@@ -206,3 +290,120 @@ class AdamWeightDecayOptimizer(tf.train.Optimizer):
     if m is not None:
       param_name = m.group(1)
     return param_name
+
+
+class LAMBOptimizer(tf.train.Optimizer):
+  """A LAMB optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="LAMBOptimizer"):
+    """Constructs a LAMBOptimizer."""
+    super(LAMBOptimizer, self).__init__(False, name)
+
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+    self.steps = 0
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None,
+      manual_fp16=False):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+      has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
+      if has_shadow:
+        # create shadow fp32 weights for fp16 variable
+        param_fp32 = tf.get_variable(
+            name=param_name + "/shadow",
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.cast(param.initialized_value(),tf.float32))
+      else:
+        param_fp32 = param
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # LAMB update
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      self.steps += 1
+      beta1_correction = (1 - self.beta_1 ** self.steps)
+      beta2_correction = (1 - self.beta_2 ** self.steps)
+
+      next_m_unbiased = next_m / beta1_correction
+      next_v_unbiased = next_v / beta2_correction
+
+      update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
+      if self._do_use_weight_decay(param_name):
+        update += self.weight_decay_rate * param_fp32
+
+      w_norm = linalg_ops.norm(param, ord=2)
+      g_norm = linalg_ops.norm(update, ord=2)
+      ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
+          math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)
+
+      update_with_lr = ratio * self.learning_rate * update
+
+      next_param = param_fp32 - update_with_lr
+
+      if has_shadow:
+        # cast shadow fp32 weights to fp16 and assign to trainable variable
+        param.assign(tf.cast(next_param, param.dtype.base_dtype))
+      assignments.extend(
+          [param_fp32.assign(next_param),
+           m.assign(next_m),
+           v.assign(next_v)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name

+ 73 - 0
TensorFlow/LanguageModeling/BERT/run.sub

@@ -0,0 +1,73 @@
+#!/bin/bash
+#SBATCH --exclusive
+#SBATCH --mem=0
+#SBATCH --overcommit
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eux
+
+readonly docker_image="nvcr.io/nvidia/tensorflow:19.08-py3"
+readonly datadir="/raid/data/bert"
+readonly checkpointdir="$PWD/checkpoints"
+
+readonly mounts=".:/workspace/bert,${datadir}:/workspace/bert/data,${checkpointdir}:/results"
+
+
+srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/phase_1"
+srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/phase_2"
+
+PHASE1="\
+     --train_batch_size=${BATCHSIZE:-16} \
+     --learning_rate=${LEARNING_RATE:-1.875e-4} \
+     --num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-128} \
+     --input_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training \
+     --eval_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test \
+     --max_seq_length=128 \
+     --max_predictions_per_seq=20 \
+     --num_train_steps=7038 \
+     --num_warmup_steps=2000 \
+     --output_dir=/results/phase_1 \
+     "
+
+PHASE2="\
+     --train_batch_size=${BATCHSIZE:-2} \
+     --learning_rate=${LEARNING_RATE:-1.25e-4} \
+     --num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-512} \
+     --input_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training \
+     --eval_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test \
+     --max_seq_length=512 \
+     --max_predictions_per_seq=80 \
+     --num_train_steps=1564 \
+     --num_warmup_steps=200 \
+     --output_dir=/results/phase_2 \
+     --init_checkpoint=/results/phase_1/model.ckpt-7038 \
+    "
+
+PHASES=( "$PHASE1" "$PHASE2" )
+
+PHASE=${PHASE:-1}
+
+BERT_CMD="\
+    python /workspace/bert/run_pretraining.py \
+     ${PHASES[$((PHASE-1))]} \
+     --bert_config_file=/workspace/bert/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json \
+     --do_train=True \
+     --do_eval=True \
+     --save_checkpoints_steps=100 \
+     --horovod --use_fp16 --use_xla \
+     --allreduce_post_accumulation=True \
+     --eval_batch_size=8"
+
+srun --mpi=pmi2 -l --container-image="${docker_image}" --container-mounts="${mounts}" bash -c "${BERT_CMD}"

+ 32 - 27
TensorFlow/LanguageModeling/BERT/run_classifier.py

@@ -1,4 +1,5 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """BERT finetuning runner."""
 
 from __future__ import absolute_import
@@ -103,7 +105,9 @@ flags.DEFINE_integer("save_checkpoints_steps", 1000,
 
 flags.DEFINE_integer("iterations_per_loop", 1000,
                      "How many steps to make in each estimator call.")
-
+flags.DEFINE_integer("num_accumulation_steps", 1,
+                     "Number of accumulation steps before gradient update" 
+                      "Global batch size = num_accumulation_steps * train_batch_size")
 flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
 
 flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
@@ -264,7 +268,7 @@ def get_frozen_tftrt_model(bert_config, shape, num_labels, use_one_hot_embedding
 
 
 
-def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
+def model_fn_builder(task_name, bert_config, num_labels, init_checkpoint, learning_rate,
                      num_train_steps, num_warmup_steps,
                      use_one_hot_embeddings, hvd=None):
   """Returns `model_fn` closure for Estimator."""
@@ -272,6 +276,25 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
   def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
     """The `model_fn` for Estimator."""
 
+    def metric_fn(per_example_loss, label_ids, logits):
+        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+        if task_name == "cola":
+            FN, FN_op = tf.metrics.false_negatives(labels=label_ids, predictions=predictions)
+            FP, FP_op = tf.metrics.false_positives(labels=label_ids, predictions=predictions)
+            TP, TP_op = tf.metrics.true_positives(labels=label_ids, predictions=predictions)
+            TN, TN_op = tf.metrics.true_negatives(labels=label_ids, predictions=predictions)
+
+            MCC = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5
+            MCC_op = tf.group(FN_op, TN_op, TP_op, FP_op, tf.identity(MCC, name="MCC"))
+            return {"MCC": (MCC, MCC_op)}
+        else:
+            accuracy = tf.metrics.accuracy(
+                labels=label_ids, predictions=predictions)
+            loss = tf.metrics.mean(values=per_example_loss)
+            return {
+                "eval_accuracy": accuracy,
+                "eval_loss": loss,
+            }
     tf.logging.info("*** Features ***")
     for name in sorted(features.keys()):
       tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
@@ -294,16 +317,6 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
             output_spec = tf.estimator.EstimatorSpec(
                 mode=mode, predictions=predictions)
         elif mode == tf.estimator.ModeKeys.EVAL:
-            def metric_fn(per_example_loss, label_ids, logits):
-              predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
-              accuracy = tf.metrics.accuracy(
-                  labels=label_ids, predictions=predictions)
-              loss = tf.metrics.mean(values=per_example_loss)
-              return {
-                  "eval_accuracy": accuracy,
-                  "eval_loss": loss,
-              }
-
             eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
             output_spec = tf.estimator.EstimatorSpec(
                 mode=mode,
@@ -335,23 +348,13 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
 
       train_op = optimization.create_optimizer(
           total_loss, learning_rate, num_train_steps, num_warmup_steps,
-          hvd, FLAGS.use_fp16)
+          hvd, False, FLAGS.use_fp16, FLAGS.num_accumulation_steps)
 
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
           loss=total_loss,
           train_op=train_op)
     elif mode == tf.estimator.ModeKeys.EVAL:
-
-      def metric_fn(per_example_loss, label_ids, logits):
-        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
-        accuracy = tf.metrics.accuracy(label_ids, predictions)
-        loss = tf.metrics.mean(per_example_loss)
-        return {
-            "eval_accuracy": accuracy,
-            "eval_loss": loss,
-        }
-
       eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
@@ -424,7 +427,8 @@ def main(_):
 
   if FLAGS.horovod:
     hvd.init()
-
+  if FLAGS.use_fp16:
+    os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
   processors = {
       "cola": ColaProcessor,
       "mnli": MnliProcessor,
@@ -460,7 +464,7 @@ def main(_):
 
   master_process = True
   training_hooks = []
-  global_batch_size = FLAGS.train_batch_size
+  global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
   hvd_rank = 0
 
   config = tf.ConfigProto()
@@ -468,7 +472,7 @@ def main(_):
 
       tf.logging.info("Multi-GPU training with TF Horovod")
       tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank())
-      global_batch_size = FLAGS.train_batch_size * hvd.size()
+      global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
       master_process = (hvd.rank() == 0)
       hvd_rank = hvd.rank()
       config.gpu_options.allow_growth = True
@@ -517,6 +521,7 @@ def main(_):
         end_index = start_index + (num_examples_per_rank)
 
   model_fn = model_fn_builder(
+      task_name=task_name,
       bert_config=bert_config,
       num_labels=len(label_list),
       init_checkpoint=FLAGS.init_checkpoint,
@@ -700,4 +705,4 @@ if __name__ == "__main__":
   flags.mark_flag_as_required("vocab_file")
   flags.mark_flag_as_required("bert_config_file")
   flags.mark_flag_as_required("output_dir")
-  tf.app.run()
+  tf.app.run()

+ 124 - 69
TensorFlow/LanguageModeling/BERT/run_pretraining.py

@@ -1,4 +1,5 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Run masked LM/next sentence masked_lm pre-training for BERT."""
 
 from __future__ import absolute_import
@@ -23,6 +25,7 @@ import time
 import modeling
 import optimization
 import tensorflow as tf
+import glob
 
 flags = tf.flags
 
@@ -35,8 +38,12 @@ flags.DEFINE_string(
     "This specifies the model architecture.")
 
 flags.DEFINE_string(
-    "input_file", None,
-    "Input TF example files (can be a glob or comma separated).")
+    "input_files_dir", None,
+    "Directory with input files, comma separated or single directory.")
+
+flags.DEFINE_string(
+    "eval_files_dir", None,
+    "Directory with eval files, comma separated or single directory. ")
 
 flags.DEFINE_string(
     "output_dir", None,
@@ -47,6 +54,10 @@ flags.DEFINE_string(
     "init_checkpoint", None,
     "Initial checkpoint (usually from a pre-trained BERT model).")
 
+flags.DEFINE_string(
+    "optimizer_type", "lamb",
+    "Optimizer used for training - LAMB or ADAM")
+
 flags.DEFINE_integer(
     "max_seq_length", 512,
     "The maximum total input sequence length after WordPiece tokenization. "
@@ -74,15 +85,27 @@ flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
 
 flags.DEFINE_integer("save_checkpoints_steps", 1000,
                      "How often to save the model checkpoint.")
+flags.DEFINE_integer("display_loss_steps", 10,
+                     "How often to print loss")
 
 flags.DEFINE_integer("iterations_per_loop", 1000,
                      "How many steps to make in each estimator call.")
 
 flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
 
+flags.DEFINE_integer("num_accumulation_steps", 1,
+                     "Number of accumulation steps before gradient update. "
+                      "Global batch size = num_accumulation_steps * train_batch_size")
+
+flags.DEFINE_bool("allreduce_post_accumulation", False, "Whether to all reduce after accumulation of N steps or after each step")
+
+flags.DEFINE_bool(
+    "verbose_logging", False,
+    "If true, all of the trainable parameters are printed")
+
 flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
 
-flags.DEFINE_bool("report_loss", False, "Whether to report total loss during training.")
+flags.DEFINE_bool("report_loss", True, "Whether to report total loss during training.")
 
 flags.DEFINE_bool("manual_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU. "
                                         "Manual casting is done instead of using AMP")
@@ -93,52 +116,83 @@ flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")
 
 # report samples/sec, total loss and learning rate during training
 class _LogSessionRunHook(tf.train.SessionRunHook):
-  def __init__(self, global_batch_size, display_every=10, hvd_rank=-1):
+  def __init__(self, global_batch_size, num_accumulation_steps, display_every=10, hvd_rank=-1):
     self.global_batch_size = global_batch_size
     self.display_every = display_every
     self.hvd_rank = hvd_rank
+    self.num_accumulation_steps = num_accumulation_steps
   def after_create_session(self, session, coord):
     self.elapsed_secs = 0.
     self.count = 0
+    self.all_count = 0
+    self.avg_loss = 0.0
+
   def before_run(self, run_context):
     self.t0 = time.time()
-    if FLAGS.manual_fp16 or FLAGS.use_fp16:
-      return tf.train.SessionRunArgs(
-          fetches=['step_update:0', 'total_loss:0',
-                   'learning_rate:0', 'nsp_loss:0',
-                   'mlm_loss:0', 'loss_scale:0'])
+    if self.num_accumulation_steps <= 1:
+        if FLAGS.manual_fp16 or FLAGS.use_fp16:
+            return tf.train.SessionRunArgs(
+                fetches=['step_update:0', 'total_loss:0',
+                         'learning_rate:0', 'nsp_loss:0',
+                         'mlm_loss:0', 'loss_scale:0'])
+        else:
+            return tf.train.SessionRunArgs(
+                fetches=['step_update:0', 'total_loss:0',
+                         'learning_rate:0', 'nsp_loss:0',
+                         'mlm_loss:0'])
     else:
-      return tf.train.SessionRunArgs(
-          fetches=['step_update:0', 'total_loss:0',
-                   'learning_rate:0', 'nsp_loss:0',
-                   'mlm_loss:0'])
+        if FLAGS.manual_fp16 or FLAGS.use_fp16:
+          return tf.train.SessionRunArgs(
+              fetches=['step_update:0', 'update_step:0', 'total_loss:0',
+                       'learning_rate:0', 'nsp_loss:0',
+                       'mlm_loss:0', 'loss_scale:0'])
+        else:
+          return tf.train.SessionRunArgs(
+              fetches=['step_update:0', 'update_step:0', 'total_loss:0',
+                       'learning_rate:0', 'nsp_loss:0',
+                       'mlm_loss:0'])
   def after_run(self, run_context, run_values):
     self.elapsed_secs += time.time() - self.t0
-    self.count += 1
-    if FLAGS.manual_fp16 or FLAGS.use_fp16:
-      global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results
+    if self.num_accumulation_steps <=1:
+        if FLAGS.manual_fp16 or FLAGS.use_fp16:
+            global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results
+        else:
+            global_step, total_loss, lr, nsp_loss, mlm_loss = run_values. \
+                results
+        update_step = True
     else:
-      global_step, total_loss, lr, nsp_loss, mlm_loss = run_values.results
-    print_step = global_step + 1 # One-based index for printing.
-    if print_step == 1 or print_step % self.display_every == 0:
-        dt = self.elapsed_secs / self.count
-        img_per_sec = self.global_batch_size / dt
-        if self.hvd_rank >= 0:
-          if FLAGS.manual_fp16 or FLAGS.use_fp16:
-            print('Rank = %2d :: Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %6.3f LR = %6.4e Loss scale = %6.4e' %
-                  (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler))
-          else:
-            print('Rank = %2d :: Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %6.3f LR = %6.4e' %
-                  (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr))
+        if FLAGS.manual_fp16 or FLAGS.use_fp16:
+          global_step, update_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results
         else:
-          if FLAGS.manual_fp16 or FLAGS.use_fp16:
-            print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %6.3f LR = %6.4e Loss scale = %6.4e' %
-                  (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler))
-          else:
-            print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %6.3f LR = %6.4e' %
-                  (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr))
-        self.elapsed_secs = 0.
-        self.count = 0
+          global_step, update_step, total_loss, lr, nsp_loss, mlm_loss = run_values.\
+              results
+    print_step = global_step + 1 # One-based index for printing.
+    self.avg_loss += total_loss
+    self.all_count += 1
+    if update_step:
+        self.count += 1
+        if (print_step == 1 or print_step % self.display_every == 0):
+            dt = self.elapsed_secs / self.count
+            sent_per_sec = self.global_batch_size / dt
+            avg_loss_step = self.avg_loss / self.all_count
+            if self.hvd_rank >= 0:
+              if FLAGS.manual_fp16 or FLAGS.use_fp16:
+                print('Rank = %2d :: Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %6.3f Average Loss = %6.3f LR = %6.4e Loss scale = %6.4e' %
+                      (self.hvd_rank, print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr, loss_scaler))
+              else:
+                print('Rank = %2d :: Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %6.3f Average Loss = %6.3f LR = %6.4e' %
+                      (self.hvd_rank, print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr))
+            else:
+              if FLAGS.manual_fp16 or FLAGS.use_fp16:
+                print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %6.3f Average Loss = %6.3f LR = %6.4e Loss scale = %6.4e' %
+                      (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr, loss_scaler))
+              else:
+                print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %6.3f Average Loss = %6.3f LR = %6.4e' %
+                      (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr))
+            self.elapsed_secs = 0.
+            self.count = 0
+            self.avg_loss = 0.0
+            self.all_count = 0
 
 def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                      num_train_steps, num_warmup_steps,
@@ -195,19 +249,20 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
 
       tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
 
-    tf.logging.info("**** Trainable Variables ****")
-    for var in tvars:
-      init_string = ""
-      if var.name in initialized_variable_names:
-        init_string = ", *INIT_FROM_CKPT*"
-      tf.logging.info("  %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape,
-                      init_string)
+    if FLAGS.verbose_logging:
+        tf.logging.info("**** Trainable Variables ****")
+        for var in tvars:
+          init_string = ""
+          if var.name in initialized_variable_names:
+            init_string = ", *INIT_FROM_CKPT*"
+          tf.logging.info("  %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape,
+                          init_string)
 
     output_spec = None
     if mode == tf.estimator.ModeKeys.TRAIN:
       train_op = optimization.create_optimizer(
           total_loss, learning_rate, num_train_steps, num_warmup_steps,
-          hvd, FLAGS.manual_fp16, FLAGS.use_fp16)
+          hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type, FLAGS.allreduce_post_accumulation)
 
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
@@ -453,27 +508,28 @@ def main(_):
   tf.gfile.MakeDirs(FLAGS.output_dir)
 
   input_files = []
-  for input_pattern in FLAGS.input_file.split(","):
-    input_files.extend(tf.gfile.Glob(input_pattern))
+  for input_file_dir in FLAGS.input_files_dir.split(","):
+    input_files.extend(tf.gfile.Glob(os.path.join(input_file_dir, "*")))
 
-  tf.logging.info("*** Input Files ***")
-  for input_file in input_files:
-    tf.logging.info("  %s" % input_file)
+  if FLAGS.horovod and len(input_files) < hvd.size():
+      raise ValueError("Input Files must be sharded")
+  if FLAGS.use_fp16 and FLAGS.manual_fp16:
+      raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")
 
-  config = tf.ConfigProto()
-  if FLAGS.horovod: 
-    config.gpu_options.visible_device_list = str(hvd.local_rank())
-    if len(input_files) < hvd.size():
-        raise ValueError("Input Files must be sharded")
-  if FLAGS.use_xla: 
-    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
   is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
   config = tf.ConfigProto()
   if FLAGS.horovod:
     config.gpu_options.visible_device_list = str(hvd.local_rank())
     config.gpu_options.allow_growth = True
+    if hvd.rank() == 0:
+      tf.logging.info("***** Configuration *****")
+      for key in FLAGS.__flags.keys():
+          tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
+      tf.logging.info("**************************")
+
 #    config.gpu_options.per_process_gpu_memory_fraction = 0.7
-  if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1	  
+  if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+
   run_config = tf.estimator.RunConfig(
       model_dir=FLAGS.output_dir,
       session_config=config,
@@ -494,18 +550,11 @@ def main(_):
       use_one_hot_embeddings=False,
       hvd=None if not FLAGS.horovod else hvd)
 
-  training_hooks = []
-  if FLAGS.horovod and hvd.size() > 1:
-    training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
-  if FLAGS.report_loss:
-    global_batch_size = FLAGS.train_batch_size if not FLAGS.horovod else FLAGS.train_batch_size*hvd.size()
-    training_hooks.append(_LogSessionRunHook(global_batch_size,1,-1 if not FLAGS.horovod else hvd.rank()))
-
   training_hooks = []
   if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0):
-    global_batch_size = FLAGS.train_batch_size if not FLAGS.horovod else FLAGS.train_batch_size*hvd.size()
-    training_hooks.append(_LogSessionRunHook(global_batch_size,100))
-  if FLAGS.horovod:
+    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
+    training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
+  if FLAGS.horovod and hvd.size() > 1:
     training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
 
   estimator = tf.estimator.Estimator(
@@ -522,14 +571,19 @@ def main(_):
         max_predictions_per_seq=FLAGS.max_predictions_per_seq,
         is_training=True,
         hvd=None if not FLAGS.horovod else hvd)
+
     estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
 
   if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
     tf.logging.info("***** Running evaluation *****")
     tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
 
+    eval_files = []
+    for eval_file_dir in FLAGS.eval_files_dir.split(","):
+        eval_files.extend(tf.gfile.Glob(os.path.join(eval_file_dir, "*")))
+
     eval_input_fn = input_fn_builder(
-        input_files=input_files,
+        input_files=eval_files,
         batch_size=FLAGS.eval_batch_size,
         max_seq_length=FLAGS.max_seq_length,
         max_predictions_per_seq=FLAGS.max_predictions_per_seq,
@@ -548,7 +602,8 @@ def main(_):
 
 
 if __name__ == "__main__":
-  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("input_files_dir")
+  flags.mark_flag_as_required("eval_files_dir")
   flags.mark_flag_as_required("bert_config_file")
   flags.mark_flag_as_required("output_dir")
   if FLAGS.use_xla and FLAGS.manual_fp16:

+ 0 - 19
TensorFlow/LanguageModeling/BERT/run_pretraining.sh

@@ -1,19 +0,0 @@
-#! /bin/bash
-
-mpiexec --allow-run-as-root --bind-to socket -np 8 python3 run_pretraining.py \
-  --input_file=/workspace/data/bert_large_wikipedia_seq_512_pred_20/tf_examples.tfrecord* \
-  --output_dir=/workspace/checkpoints/pretraining_base_output \
-  --do_train=True \
-  --do_eval=True \
-  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
-  --train_batch_size=14 \
-  --max_seq_length=512 \
-  --max_predictions_per_seq=20 \
-  --num_train_steps=250000 \
-  --num_warmup_steps=10000 \
-  --learning_rate=1e-4 \
-  --use_fp16 \
-  --use_xla \
-  --report_loss \
-  --horovod
-

+ 11 - 3
TensorFlow/LanguageModeling/BERT/run_squad.py

@@ -1,4 +1,5 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Run BERT on SQuAD 1.1 and SQuAD 2.0."""
 
 from __future__ import absolute_import, division, print_function
@@ -114,6 +116,10 @@ flags.DEFINE_integer("save_checkpoints_steps", 1000,
 flags.DEFINE_integer("iterations_per_loop", 1000,
                      "How many steps to make in each estimator call.")
 
+flags.DEFINE_integer("num_accumulation_steps", 1,
+                     "Number of accumulation steps before gradient update. "
+                      "Global batch size = num_accumulation_steps * train_batch_size")
+
 flags.DEFINE_integer(
     "n_best_size", 20,
     "The total number of n-best predictions to generate in the "
@@ -336,7 +342,7 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
       total_loss = (start_loss + end_loss) / 2.0
 
       train_op = optimization.create_optimizer(
-          total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, amp=use_fp16)
+          total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16, FLAGS.num_accumulation_steps)
 
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
@@ -899,6 +905,8 @@ def main(_):
 
   if FLAGS.horovod:
     hvd.init()
+  if FLAGS.use_fp16:
+    os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
 
   bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
 
@@ -911,7 +919,7 @@ def main(_):
 
   master_process = True
   training_hooks = []
-  global_batch_size = FLAGS.train_batch_size
+  global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
   hvd_rank = 0
   hvd_local_rank = 0
 
@@ -921,7 +929,7 @@ def main(_):
 
       tf.logging.info("Multi-GPU training with TF Horovod")
       tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank())
-      global_batch_size = FLAGS.train_batch_size * hvd.size()
+      global_batch_size = FLAGS.train_batch_size * hvd.size() * FLAGS.num_accumulation_steps
       learning_rate = learning_rate * hvd.size()
       master_process = (hvd.rank() == 0)
       hvd_rank = hvd.rank()

+ 13 - 0
TensorFlow/LanguageModeling/BERT/run_squad_trtis_client.py

@@ -1,3 +1,16 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import modeling
 import tokenization
 from tensorrtserver.api import ProtocolType, InferContext, ServerStatusContext, grpc_service_pb2_grpc, grpc_service_pb2, model_config_pb2

+ 14 - 1
TensorFlow/LanguageModeling/BERT/scripts/data_download.sh

@@ -1,6 +1,19 @@
 #!/usr/bin/env bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 docker run --runtime=nvidia -v $PWD:/workspace/bert \
     --rm --shm-size=1g --ulimit memlock=-1 \
     --ulimit stack=67108864 --ipc=host -t -i \
-    bert bash -c "bash scripts/data_download_helper.sh"
+    bert bash -c "bash data/create_datasets_from_start.sh"

+ 0 - 17
TensorFlow/LanguageModeling/BERT/scripts/data_download_helper.sh

@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-
-# Download pretrained_models
-cd /workspace/bert/data/pretrained_models_google && python3 download_models.py
-
-# Download SQUAD
-cd /workspace/bert/data/squad && . squad_download.sh
-
-# Download GLUE
-cd /workspace/bert/data/glue && python3 download_glue_data.py
-
-# WIKI Download, set config in data_generators/wikipedia_corpus/config.sh
-cd /workspace/bert/data/wikipedia_corpus && . run_preprocessing.sh
-
-cd /workspace/bert/data/bookcorpus && . run_preprocessing.sh
-
-cd /workspace/bert/data/glue && python3 download_glue_data.py 

+ 16 - 5
TensorFlow/LanguageModeling/BERT/scripts/finetune_inference_benchmark.sh

@@ -1,13 +1,26 @@
 #!/bin/bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 bert_model=${1:-"large"}
 use_xla=${2:-"true"}
 task=${3:-"squad"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/pretrained_models_google/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
 else
-    export BERT_DIR=data/pretrained_models_google/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
 echo  "BERT directory set as " $BERT_DIR
 
@@ -31,7 +44,7 @@ echo "Results directory set as " $RESULTS_DIR
 LOGFILE="${RESULTS_DIR}/${task}_inference_benchmark_bert_${bert_model}.log"
 tmp_file="/tmp/${task}_inference_benchmark.log"
 if [ "$task" = "squad" ] ; then
-    export SQUAD_DIR=data/squad/v1.1
+    export SQUAD_DIR=data/download/squad/v1.1
 
     echo "Squad directory set as " $SQUAD_DIR
 
@@ -48,11 +61,9 @@ if [ "$task" = "squad" ] ; then
 
         if [ "$precision" = "fp16" ] ; then
             echo "fp16 activated!"
-            export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
             use_fp16="--use_fp16"
         else
             echo "fp32 activated!"
-            export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=0
             use_fp16=""
         fi
 

+ 19 - 15
TensorFlow/LanguageModeling/BERT/scripts/finetune_train_benchmark.sh

@@ -1,15 +1,27 @@
 #!/bin/bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 bert_model=${1:-"large"}
-precision=${2:-"fp16"}
-use_xla=${3:-"true"}
-num_gpu=${4:-"8"}
-task=${5:-"squad"}
+use_xla=${2:-"true"}
+num_gpu=${3:-"8"}
+task=${4:-"squad"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/pretrained_models_google/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
 else
-    export BERT_DIR=data/pretrained_models_google/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
 
 echo  "BERT directory set as " $BERT_DIR
@@ -25,12 +37,6 @@ if [ ! -d "$RESULTS_DIR" ] ; then
 fi
 echo "Results directory set as " $RESULTS_DIR
 
-use_fp16=""
-if [ "$precision" = "fp16" ] ; then
-        export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
-        use_fp16="--use_fp16"
-fi
-
 
 if [ "$use_xla" = "true" ] ; then
     use_xla_tag="--use_xla"
@@ -53,7 +59,7 @@ fi
 LOGFILE="${RESULTS_DIR}/${task}_training_benchmark_bert_${bert_model}_gpu_${num_gpu}.log"
 
 if [ "$task" = "squad" ] ; then
-    export SQUAD_DIR=data/squad/v1.1
+    export SQUAD_DIR=data/download/squad/v1.1
     epochs="2.0"
     echo "Squad directory set as " $SQUAD_DIR
 
@@ -76,11 +82,9 @@ if [ "$task" = "squad" ] ; then
 
                 if [ "$precision" = "fp16" ] ; then
                     echo "fp16 activated!"
-                    export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
                     use_fp16="--use_fp16"
                 else
                     echo "fp32 activated!"
-                    export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=0
                     use_fp16=""
                 fi
 

+ 51 - 45
TensorFlow/LanguageModeling/BERT/scripts/run_glue.sh

@@ -1,49 +1,47 @@
 #!/usr/bin/env bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
-batch_size=${1:-"32"}
-learning_rate=${2:-"2e-5"}
-precision=${3:-"fp16"}
-use_xla=${4:-"true"}
-num_gpu=${5:-"8"}
-seq_length=${6:-"128"}
-bert_model=${7:-"large"}
+task_name=${1:-"MRPC"}
+batch_size=${2:-"32"}
+learning_rate=${3:-"2e-5"}
+precision=${4:-"fp16"}
+use_xla=${5:-"true"}
+num_gpu=${6:-"8"}
+seq_length=${7:-"128"}
+doc_stride=${8:-"64"}
+bert_model=${9:-"large"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/pretrained_models_google/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
 else
-    export BERT_DIR=data/pretrained_models_google/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
-export GLUE_DIR=data/glue
-
-epochs=${8:-"3.0"}
-ws=${9:-"0.1"}
-init_checkpoint=${10:-"$BERT_DIR/bert_model.ckpt"}
+export GLUE_DIR=data/download
 
-#Edit to save logs & checkpoints in a different directory
-RESULTS_DIR=/results
 
-if [ ! -d "$BERT_DIR" ] ; then
-   echo "Error! $BERT_DIR directory missing. Please mount pretrained BERT dataset."
-   exit -1
-fi
-if [ ! -d "$GLUE_DIR" ] ; then
-   echo "Error! $GLUE_DIR directory missing. Please mount SQuAD dataset."
-   exit -1
-fi
-if [ ! -d "$RESULTS_DIR" ] ; then
-   echo "Error! $RESULTS_DIR directory missing."
-   exit -1
-fi
+epochs=${10:-"3.0"}
+ws=${11:-"0.1"}
+init_checkpoint=${12:-"$BERT_DIR/bert_model.ckpt"}
 
 echo "GLUE directory set as " $GLUE_DIR " BERT directory set as " $BERT_DIR
-echo "Results directory set as " $RESULTS_DIR
 
 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
         echo "fp16 activated!"
-        export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
         use_fp16="--use_fp16"
 fi
 
@@ -60,34 +58,42 @@ if [ $num_gpu -gt 1 ] ; then
     -x NCCL_DEBUG=INFO \
     -x LD_LIBRARY_PATH \
     -x PATH -mca pml ob1 -mca btl ^openib"
-    use_hvd="--horovod"
 else
     mpi_command=""
-    use_hvd=""
 fi
 
-  export GBS=$(expr $batch_size \* $num_gpu)
-  printf -v TAG "tf_bert_%s_glue_1n_%s_gbs%d" "$bert_model" "$precision" $GBS
-  DATESTAMP=`date +'%y%m%d%H%M%S'`
-  RESULTS_DIR=${RESULTS_DIR}/${TAG}_${DATESTAMP}
-  mkdir $RESULTS_DIR
-  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
-  printf "Saving checkpoints to %s\n" "$RESULTS_DIR"
-  printf "Writing logs to %s\n" "$LOGFILE"
+export GBS=$(expr $batch_size \* $num_gpu)
+printf -v TAG "tf_bert_finetuning_glue_%s_%s_%s_gbs%d" "$task_name" "$bert_model" "$precision" $GBS
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+#Edit to save logs & checkpoints in a different directory
+RESULTS_DIR=/results/${TAG}_${DATESTAMP}
+LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
+mkdir -m 777 -p $RESULTS_DIR
+printf "Saving checkpoints to %s\n" "$RESULTS_DIR"
+printf "Logs written to %s\n" "$LOGFILE"
+
+#Check if all necessary files are available before training
+for DIR_or_file in $GLUE_DIR/${task_name} $RESULTS_DIR $BERT_DIR/vocab.txt $BERT_DIR/bert_config.json; do
+  echo $DIR_or_file
+  if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then
+     echo "Error! $DIR_or_file directory missing. Please mount correctly"
+     exit -1
+  fi
+done
 
 $mpi_command python run_classifier.py \
-  --task_name=MRPC \
+  --task_name=$task_name \
   --do_train=true \
   --do_eval=true \
-  --data_dir=$GLUE_DIR/MRPC \
+  --data_dir=$GLUE_DIR/$task_name \
   --vocab_file=$BERT_DIR/vocab.txt \
   --bert_config_file=$BERT_DIR/bert_config.json \
   --init_checkpoint=$init_checkpoint \
   --max_seq_length=$seq_length \
+  --doc_stride=$doc_stride \
   --train_batch_size=$batch_size \
   --learning_rate=$learning_rate \
   --num_train_epochs=$epochs \
   --output_dir=$RESULTS_DIR \
-    "$use_hvd" \
-    "$use_fp16" \
-    $use_xla_tag --warmup_proportion=$ws |& tee $LOGFILE
+  --horovod "$use_fp16" \
+  $use_xla_tag --warmup_proportion=$ws |& tee $LOGFILE

+ 78 - 0
TensorFlow/LanguageModeling/BERT/scripts/run_glue_inference.sh

@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+task_name=${1:-"MRPC"}
+init_checkpoint=${2:-"$BERT_DIR/bert_model.ckpt"}
+batch_size=${3:-"32"}
+precision=${4:-"fp16"}
+use_xla=${5:-"true"}
+seq_length=${6:-"128"}
+doc_stride=${7:-"64"}
+bert_model=${8:-"large"}
+
+if [ "$bert_model" = "large" ] ; then
+    BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+else
+    BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
+fi
+GLUE_DIR=data/download
+
+echo "GLUE directory set as " $GLUE_DIR " BERT directory set as " $BERT_DIR
+
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+        echo "fp16 activated!"
+        use_fp16="--use_fp16"
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag=""
+fi
+
+
+export GBS=$(expr $batch_size \* $num_gpu)
+printf -v TAG "tf_bert_finetuning_glue_%s_inf_%s_%s_gbs%d_ckpt_%s" "$task_name" "$bert_model" "$precision" $GBS "$init_checkpoint"
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+#Edit to save logs & checkpoints in a different directory
+RESULTS_DIR=/results
+LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
+printf "Logs written to %s\n" "$LOGFILE"
+
+#Check if all necessary files are available before training
+for DIR_or_file in $GLUE_DIR $RESULTS_DIR $BERT_DIR/vocab.txt $BERT_DIR/bert_config.json; do
+  if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then
+     echo "Error! $DIR_or_file directory missing. Please mount correctly"
+     exit -1
+  fi
+done
+
+$mpi_command python run_classifier.py \
+  --task_name=$task_name \
+  --predict_batch_size=$batch_size \
+  --eval_batch_size=$batch_size \
+  --do_eval=true \
+  --data_dir=$GLUE_DIR/$task_name \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_checkpoint=$init_checkpoint \
+  --max_seq_length=$seq_length \
+  --doc_stride=$doc_stride \
+  --output_dir=$RESULTS_DIR \
+  --horovod "$use_fp16" \
+  $use_xla_tag |& tee $LOGFILE

+ 0 - 102
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining.sh

@@ -1,102 +0,0 @@
-#! /bin/bash
-
-echo "Container nvidia build = " $NVIDIA_BUILD_ID
-
-WIKI_DIR=/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
-BOOKS_DIR=/workspace/bert/data/bookcorpus/final_tfrecords_sharded
-BERT_CONFIG=/workspace/bert/data/pretrained_models_google/uncased_L-24_H-1024_A-16/bert_config.json
-
-#Edit to save logs & checkpoints in a different directory
-RESULTS_DIR=/results
-
-if [ ! -d "$WIKI_DIR" ] ; then
-   echo "Error! $WIKI_DIR directory missing. Please mount wikipedia dataset."
-   exit -1
-else
-   SOURCES="$WIKI_DIR/*"
-fi
-if [ ! -d "$BOOKS_DIR" ] ; then
-   echo "Warning! $BOOKS_DIR directory missing. Training will proceed without book corpus."
-else
-   SOURCES+=" $BOOKS_DIR/*"
-fi
-if [ ! -d "$RESULTS_DIR" ] ; then
-   echo "Error! $RESULTS_DIR directory missing."
-   exit -1
-fi
-
-if [ ! -f "$BERT_CONFIG" ] ; then
-   echo "Error! BERT large configuration file not found at $BERT_CONFIG"
-   exit -1
-fi
-
-train_batch_size=${1:-14}
-eval_batch_size=${2:-8}
-learning_rate=${3:-"1e-4"}
-precision=${4:-"manual_fp16"}
-use_xla=${5:-"true"}
-num_gpus=${6:-1}
-warmup_steps=${7:-"10000"}
-train_steps=${8:-1144000}
-save_checkpoints_steps=${9:-5000}
-
-PREC=""
-if [ "$precision" = "fp16" ] ; then
-   PREC="--use_fp16"
-elif [ "$precision" = "fp32" ] ; then
-   PREC=""
-elif [ "$precision" = "manual_fp16" ] ; then
-   PREC="--manual_fp16"
-else
-   echo "Unknown <precision> argument"
-   exit -2
-fi
-
-if [ "$use_xla" = "true" ] ; then
-    PREC="$PREC --use_xla"
-    echo "XLA activated"
-fi
-
-export GBS=$(expr $train_batch_size \* $num_gpus)
-printf -v TAG "tf_bert_pretraining_%s_gbs%d" "$precision" $GBS
-DATESTAMP=`date +'%y%m%d%H%M%S'`
-RESULTS_DIR=${RESULTS_DIR}/${TAG}_${DATESTAMP}
-LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
-printf "Saving checkpoints to %s\n" "$RESULTS_DIR"
-printf "Logs written to %s\n" "$LOGFILE"
-
-echo $SOURCES
-INPUT_FILES=$(eval ls $SOURCES | tr " " "\n" | awk '{printf "%s,",$1}' | sed s'/.$//')
-CMD="python3 /workspace/bert/run_pretraining.py"
-CMD+=" --input_file=$INPUT_FILES"
-CMD+=" --output_dir=$RESULTS_DIR"
-CMD+=" --bert_config_file=$BERT_CONFIG"
-CMD+=" --do_train=True"
-CMD+=" --do_eval=True"
-CMD+=" --train_batch_size=$train_batch_size"
-CMD+=" --eval_batch_size=$eval_batch_size"
-CMD+=" --max_seq_length=512"
-CMD+=" --max_predictions_per_seq=80"
-CMD+=" --num_train_steps=$train_steps"
-CMD+=" --num_warmup_steps=$warmup_steps"
-CMD+=" --save_checkpoints_steps=$save_checkpoints_steps"
-CMD+=" --learning_rate=$learning_rate"
-CMD+=" --report_loss"
-CMD+=" --horovod $PREC"
-
-if [ $num_gpus -gt 1 ] ; then
-   CMD="mpiexec --allow-run-as-root -np $num_gpus --bind-to socket $CMD"
-fi
-
-
-
-
-set -x
-if [ -z "$LOGFILE" ] ; then
-   $CMD
-else
-   (
-     $CMD
-   ) |& tee $LOGFILE
-fi
-set +x

+ 111 - 0
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_adam.sh

@@ -0,0 +1,111 @@
+#! /bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+train_batch_size=${1:-14}
+eval_batch_size=${2:-8}
+learning_rate=${3:-"1e-4"}
+precision=${4:-"manual_fp16"}
+use_xla=${5:-"true"}
+num_gpus=${6:-8}
+warmup_steps=${7:-"10000"}
+train_steps=${8:-1144000}
+save_checkpoints_steps=${9:-5000}
+bert_model=${10:-"large"}
+num_accumulation_steps=${11:-1}
+seq_len=${12:-512}
+max_pred_per_seq=${13:-80}
+
+DATA_DIR=data/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json
+else
+    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12/bert_config.json
+fi
+
+PREC=""
+if [ "$precision" = "fp16" ] ; then
+   PREC="--use_fp16"
+elif [ "$precision" = "fp32" ] ; then
+   PREC=""
+elif [ "$precision" = "manual_fp16" ] ; then
+   PREC="--manual_fp16"
+else
+   echo "Unknown <precision> argument"
+   exit -2
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    PREC="$PREC --use_xla"
+    echo "XLA activated"
+fi
+
+export GBS=$(expr $train_batch_size \* $num_gpus \* $num_accumulation_steps)
+printf -v TAG "tf_bert_pretraining_adam_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+
+#Edit to save logs & checkpoints in a different directory
+RESULTS_DIR=${RESULTS_DIR:-/results/${TAG}_${DATESTAMP}}
+LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
+mkdir -m 777 -p $RESULTS_DIR
+printf "Saving checkpoints to %s\n" "$RESULTS_DIR"
+printf "Logs written to %s\n" "$LOGFILE"
+
+INPUT_FILES="$DATA_DIR/training"
+EVAL_FILES="$DATA_DIR/test"
+
+CMD="python3 /workspace/bert/run_pretraining.py"
+CMD+=" --input_files_dir=$INPUT_FILES"
+CMD+=" --eval_files_dir=$EVAL_FILES"
+CMD+=" --output_dir=$RESULTS_DIR"
+CMD+=" --bert_config_file=$BERT_CONFIG"
+CMD+=" --do_train=True"
+CMD+=" --do_eval=True"
+CMD+=" --train_batch_size=$train_batch_size"
+CMD+=" --eval_batch_size=$eval_batch_size"
+CMD+=" --max_seq_length=$seq_len"
+CMD+=" --max_predictions_per_seq=$max_pred_per_seq"
+CMD+=" --num_train_steps=$train_steps"
+CMD+=" --num_warmup_steps=$warmup_steps"
+CMD+=" --num_accumulation_steps=$num_accumulation_steps"
+CMD+=" --save_checkpoints_steps=$save_checkpoints_steps"
+CMD+=" --learning_rate=$learning_rate"
+CMD+=" --optimizer_type=adam"
+CMD+=" --horovod $PREC"
+CMD+=" --allreduce_post_accumulation=True"
+
+#Check if all necessary files are available before training
+for DIR_or_file in $DATA_DIR $BERT_CONFIG $RESULTS_DIR; do
+  if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then
+     echo "Error! $DIR_or_file directory missing. Please mount correctly"
+     exit -1
+  fi
+done
+
+if [ $num_gpus -gt 1 ] ; then
+   CMD="mpiexec --allow-run-as-root -np $num_gpus --bind-to socket $CMD"
+fi
+
+set -x
+if [ -z "$LOGFILE" ] ; then
+   $CMD
+else
+   (
+     $CMD
+   ) |& tee $LOGFILE
+fi
+set +x

+ 60 - 0
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb.sh

@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+train_batch_size_phase1=${1:-64}
+train_batch_size_phase2=${2:-8}
+eval_batch_size=${3:-8}
+learning_rate_phase1=${4:-"7.5e-4"}
+learning_rate_phase2=${5:-"5e-4"}
+precision=${6:-"fp16"}
+use_xla=${7:-"true"}
+num_gpus=${8:-8}
+warmup_steps_phase1=${9:-"2000"}
+warmup_steps_phase2=${10:-"200"}
+train_steps=${11:-7820}
+save_checkpoints_steps=${12:-100}
+num_accumulation_steps_phase1=${13:-128}
+num_accumulation_steps_phase2=${14:-512}
+bert_model=${15:-"large"}
+
+DATA_DIR=data
+export DATA_DIR=$DATA_DIR
+
+GBS1=$(expr $train_batch_size_phase1 \* $num_gpus \* $num_accumulation_steps_phase1)
+GBS2=$(expr $train_batch_size_phase2 \* $num_gpus \* $num_accumulation_steps_phase2)
+printf -v TAG "tf_bert_pretraining_lamb_%s_%s_gbs1%d_gbs2%d" "$bert_model" "$precision" $GBS1 $GBS2
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+
+#Edit to save logs & checkpoints in a different directory
+RESULTS_DIR=${RESULTS_DIR:-/results/${TAG}_${DATESTAMP}}
+LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
+mkdir -m 777 -p $RESULTS_DIR
+printf "Saving checkpoints to %s\n" "$RESULTS_DIR"
+printf "Logs written to %s\n" "$LOGFILE"
+export RESULTS_DIR=$RESULTS_DIR
+
+printf -v SCRIPT_ARGS "%d %d %d %e %e %s %s %d %d %d %d %d %d %d %s %s" \
+                      $train_batch_size_phase1 $train_batch_size_phase2 $eval_batch_size $learning_rate_phase1 \
+                      $learning_rate_phase2 "$precision" "$use_xla" $num_gpus $warmup_steps_phase1 \
+                      $warmup_steps_phase2 $train_steps $save_checkpoints_steps \
+                      $num_accumulation_steps_phase1 $num_accumulation_steps_phase2 "$bert_model"
+
+# RUN PHASE 1
+bash scripts/run_pretraining_lamb_phase1.sh $SCRIPT_ARGS |& tee -a $LOGFILE
+
+# RUN PHASE 2
+bash scripts/run_pretraining_lamb_phase2.sh $SCRIPT_ARGS |& tee -a $LOGFILE

+ 103 - 0
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase1.sh

@@ -0,0 +1,103 @@
+#! /bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+train_batch_size_phase1=${1:-64}
+train_batch_size_phase2=${2:-8}
+eval_batch_size=${3:-8}
+learning_rate_phase1=${4:-"7.5e-4"}
+learning_rate_phase2=${5:-"5e-4"}
+precision=${6:-"fp16"}
+use_xla=${7:-"true"}
+num_gpus=${8:-2}
+warmup_steps_phase1=${9:-"2000"}
+warmup_steps_phase2=${10:-"200"}
+train_steps=${11:-7820}
+save_checkpoints_steps=${12:-100}
+num_accumulation_steps_phase1=${13:-128}
+num_accumulation_steps_phase2=${14:-512}
+bert_model=${15:-"large"}
+
+DATA_DIR=${DATA_DIR:-data}
+#Edit to save logs & checkpoints in a different directory
+RESULTS_DIR=${RESULTS_DIR:-/results}
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json
+else
+    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12/bert_config.json
+fi
+
+PREC=""
+if [ "$precision" = "fp16" ] ; then
+   PREC="--use_fp16"
+elif [ "$precision" = "fp32" ] ; then
+   PREC=""
+elif [ "$precision" = "manual_fp16" ] ; then
+   PREC="--manual_fp16"
+else
+   echo "Unknown <precision> argument"
+   exit -2
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    PREC="$PREC --use_xla"
+    echo "XLA activated"
+fi
+
+mpi=""
+if [ $num_gpus -gt 1 ] ; then
+   mpi="mpiexec --allow-run-as-root -np $num_gpus --bind-to socket"
+fi
+
+#PHASE 1
+
+train_steps_phase1=$(expr $train_steps \* 9 \/ 10) #Phase 1 is 10% of training
+gbs_phase1=$(expr $train_batch_size_phase1 \* $num_accumulation_steps_phase1)
+seq_len=128
+max_pred_per_seq=20
+RESULTS_DIR_PHASE1=${RESULTS_DIR}/phase_1
+mkdir -m 777 -p $RESULTS_DIR_PHASE1
+
+INPUT_FILES="$DATA_DIR/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training"
+EVAL_FILES="$DATA_DIR/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test"
+
+#Check if all necessary files are available before training
+for DIR_or_file in $DATA_DIR $RESULTS_DIR_PHASE1 $BERT_CONFIG; do
+  if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then
+     echo "Error! $DIR_or_file directory missing. Please mount correctly"
+     exit -1
+  fi
+done
+
+ $mpi python /workspace/bert/run_pretraining.py \
+     --input_files_dir=$INPUT_FILES \
+     --eval_files_dir=$EVAL_FILES \
+     --output_dir=$RESULTS_DIR_PHASE1 \
+     --bert_config_file=$BERT_CONFIG \
+     --do_train=True \
+     --do_eval=True \
+     --train_batch_size=$train_batch_size_phase1 \
+     --eval_batch_size=$eval_batch_size \
+     --max_seq_length=$seq_len \
+     --max_predictions_per_seq=$max_pred_per_seq \
+     --num_train_steps=$train_steps_phase1 \
+     --num_accumulation_steps=$num_accumulation_steps_phase1 \
+     --num_warmup_steps=$warmup_steps_phase1 \
+     --save_checkpoints_steps=$save_checkpoints_steps \
+     --learning_rate=$learning_rate_phase1 \
+     --horovod $PREC \
+     --allreduce_post_accumulation=True

+ 115 - 0
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase2.sh

@@ -0,0 +1,115 @@
+#! /bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+train_batch_size_phase1=${1:-64}
+train_batch_size_phase2=${2:-8}
+eval_batch_size=${3:-8}
+learning_rate_phase1=${4:-"7.5e-4"}
+learning_rate_phase2=${5:-"5e-4"}
+precision=${6:-"fp16"}
+use_xla=${7:-"true"}
+num_gpus=${8:-2}
+warmup_steps_phase1=${9:-"2000"}
+warmup_steps_phase2=${10:-"200"}
+train_steps=${11:-7820}
+save_checkpoints_steps=${12:-100}
+num_accumulation_steps_phase1=${13:-128}
+num_accumulation_steps_phase2=${14:-512}
+bert_model=${15:-"large"}
+
+DATA_DIR=${DATA_DIR:-data}
+#Edit to save logs & checkpoints in a different directory
+RESULTS_DIR=${RESULTS_DIR:-/results}
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json
+else
+    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12/bert_config.json
+fi
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+PREC=""
+if [ "$precision" = "fp16" ] ; then
+   PREC="--use_fp16"
+elif [ "$precision" = "fp32" ] ; then
+   PREC=""
+elif [ "$precision" = "manual_fp16" ] ; then
+   PREC="--manual_fp16"
+else
+   echo "Unknown <precision> argument"
+   exit -2
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    PREC="$PREC --use_xla"
+    echo "XLA activated"
+fi
+
+mpi=""
+if [ $num_gpus -gt 1 ] ; then
+   mpi="mpiexec --allow-run-as-root -np $num_gpus --bind-to socket"
+fi
+
+#PHASE 1 Config
+
+train_steps_phase1=$(expr $train_steps \* 9 \/ 10) #Phase 1 is 10% of training
+gbs_phase1=$(expr $train_batch_size_phase1 \* $num_accumulation_steps_phase1)
+PHASE1_CKPT=${RESULTS_DIR}/phase_1/model.ckpt-${train_steps_phase1}
+
+#PHASE 2
+
+seq_len=512
+max_pred_per_seq=80
+train_steps_phase2=$(expr $train_steps \* 1 \/ 10) #Phase 2 is 10% of training
+gbs_phase2=$(expr $train_batch_size_phase2 \* $num_accumulation_steps_phase2)
+train_steps_phase2=$(expr $train_steps_phase2 \* $gbs_phase1 \/ $gbs_phase2) # Adjust for batch size
+
+RESULTS_DIR_PHASE2=${RESULTS_DIR}/phase_2
+mkdir -m 777 -p $RESULTS_DIR_PHASE2
+
+INPUT_FILES="$DATA_DIR/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training"
+EVAL_FILES="$DATA_DIR/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test"
+
+#Check if all necessary files are available before training
+for DIR_or_file in $DATA_DIR $RESULTS_DIR $BERT_CONFIG ${PHASE1_CKPT}.meta; do
+  if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then
+     echo "Error! $DIR_or_file directory missing. Please mount correctly"
+     exit -1
+  fi
+done
+
+$mpi python /workspace/bert/run_pretraining.py \
+    --input_files_dir=$INPUT_FILES \
+    --init_checkpoint=$PHASE1_CKPT \
+    --eval_files_dir=$EVAL_FILES \
+    --output_dir=$RESULTS_DIR_PHASE2 \
+    --bert_config_file=$BERT_CONFIG \
+    --do_train=True \
+    --do_eval=True \
+    --train_batch_size=$train_batch_size_phase2 \
+    --eval_batch_size=$eval_batch_size \
+    --max_seq_length=$seq_len \
+    --max_predictions_per_seq=$max_pred_per_seq \
+    --num_train_steps=$train_steps_phase2 \
+    --num_accumulation_steps=$num_accumulation_steps_phase2 \
+    --num_warmup_steps=$warmup_steps_phase2 \
+    --save_checkpoints_steps=$save_checkpoints_steps \
+    --learning_rate=$learning_rate_phase2 \
+    --horovod $PREC \
+    --allreduce_post_accumulation=True
+

+ 51 - 51
TensorFlow/LanguageModeling/BERT/scripts/run_squad.sh

@@ -1,5 +1,18 @@
 #!/usr/bin/env bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
 batch_size=${1:-"8"}
@@ -12,14 +25,14 @@ doc_stride=${7:-"128"}
 bert_model=${8:-"large"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/pretrained_models_google/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
 else
-    export BERT_DIR=data/pretrained_models_google/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
 
 squad_version=${9:-"1.1"}
 
-export SQUAD_DIR=data/squad/v${squad_version}
+export SQUAD_DIR=data/download/squad/v${squad_version}
 if [ "$squad_version" = "1.1" ] ; then
     version_2_with_negative="False"
 else
@@ -29,29 +42,11 @@ fi
 init_checkpoint=${10:-"$BERT_DIR/bert_model.ckpt"}
 epochs=${11:-"2.0"}
 
-#Edit to save logs & checkpoints in a different directory
-RESULTS_DIR=/results
-
-if [ ! -d "$SQUAD_DIR" ] ; then
-   echo "Error! $SQUAD_DIR directory missing. Please mount SQuAD dataset."
-   exit -1
-fi
-if [ ! -d "$BERT_DIR" ] ; then
-   echo "Error! $BERT_DIR directory missing. Please mount pretrained BERT dataset."
-   exit -1
-fi
-if [ ! -d "$RESULTS_DIR" ] ; then
-   echo "Error! $RESULTS_DIR directory missing."
-   exit -1
-fi
-
 echo "Squad directory set as " $SQUAD_DIR " BERT directory set as " $BERT_DIR
-echo "Results directory set as " $RESULTS_DIR
 
 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
         echo "fp16 activated!"
-        export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
         use_fp16="--use_fp16"
 fi
 
@@ -68,40 +63,45 @@ if [ $num_gpu -gt 1 ] ; then
     -x NCCL_DEBUG=INFO \
     -x LD_LIBRARY_PATH \
     -x PATH -mca pml ob1 -mca btl ^openib"
-    use_hvd="--horovod"
 else
     mpi_command=""
-    use_hvd=""
 fi
 
+export GBS=$(expr $batch_size \* $num_gpu)
+printf -v TAG "tf_bert_finetuning_squad_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
+DATESTAMP=`date +'%y%m%d%H%M%S'`
 
-  export GBS=$(expr $batch_size \* $num_gpu)
-  printf -v TAG "tf_bert_%s_squad_1n_%s_gbs%d" "$bert_model" "$precision" $GBS
-  DATESTAMP=`date +'%y%m%d%H%M%S'`
-
-  RESULTS_DIR=${RESULTS_DIR}/${TAG}_${DATESTAMP}
-  mkdir $RESULTS_DIR
-  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
-  printf "Saving checkpoints to %s\n" "$RESULTS_DIR"
-  printf "Writing logs to %s\n" "$LOGFILE"
-
-    $mpi_command python run_squad.py \
-    --vocab_file=$BERT_DIR/vocab.txt \
-    --bert_config_file=$BERT_DIR/bert_config.json \
-    --init_checkpoint=$init_checkpoint \
-    --do_train=True \
-    --train_file=$SQUAD_DIR/train-v${squad_version}.json \
-    --do_predict=True \
-    --predict_file=$SQUAD_DIR/dev-v${squad_version}.json \
-    --train_batch_size=$batch_size \
-    --learning_rate=$learning_rate \
-    --num_train_epochs=$epochs \
-    --max_seq_length=$seq_length \
-    --doc_stride=$doc_stride \
-    --save_checkpoints_steps 1000 \
-    --output_dir=$RESULTS_DIR \
-    "$use_hvd" \
-    "$use_fp16" \
-    $use_xla_tag --version_2_with_negative=${version_2_with_negative} |& tee $LOGFILE
+#Edit to save logs & checkpoints in a different directory
+RESULTS_DIR=/results/${TAG}_${DATESTAMP}
+LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
+mkdir -m 777 -p $RESULTS_DIR
+printf "Saving checkpoints to %s\n" "$RESULTS_DIR"
+printf "Logs written to %s\n" "$LOGFILE"
+
+#Check if all necessary files are available before training
+for DIR_or_file in $SQUAD_DIR $RESULTS_DIR $BERT_DIR/bert_config.json $BERT_DIR/vocab.txt; do
+  if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then
+     echo "Error! $DIR_or_file directory missing. Please mount correctly"
+     exit -1
+  fi
+done
+
+$mpi_command python run_squad.py \
+--vocab_file=$BERT_DIR/vocab.txt \
+--bert_config_file=$BERT_DIR/bert_config.json \
+--init_checkpoint=$init_checkpoint \
+--do_train=True \
+--train_file=$SQUAD_DIR/train-v${squad_version}.json \
+--do_predict=True \
+--predict_file=$SQUAD_DIR/dev-v${squad_version}.json \
+--train_batch_size=$batch_size \
+--learning_rate=$learning_rate \
+--num_train_epochs=$epochs \
+--max_seq_length=$seq_length \
+--doc_stride=$doc_stride \
+--save_checkpoints_steps 1000 \
+--output_dir=$RESULTS_DIR \
+--horovod "$use_fp16" \
+$use_xla_tag --version_2_with_negative=${version_2_with_negative} |& tee $LOGFILE
 
 python $SQUAD_DIR/evaluate-v${squad_version}.py $SQUAD_DIR/dev-v${squad_version}.json ${RESULTS_DIR}/predictions.json |& tee -a $LOGFILE

+ 31 - 23
TensorFlow/LanguageModeling/BERT/scripts/run_squad_inference.sh

@@ -1,5 +1,18 @@
 #!/usr/bin/env bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
 init_checkpoint=${1:-"/results/model.ckpt"}
@@ -12,33 +25,18 @@ bert_model=${7:-"large"}
 squad_version=${8:-"1.1"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/pretrained_models_google/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
 else
-    export BERT_DIR=data/pretrained_models_google/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
 
-export SQUAD_DIR=data/squad/v${squad_version}
+export SQUAD_DIR=data/download/squad/v${squad_version}
 if [ "$squad_version" = "1.1" ] ; then
     version_2_with_negative="False"
 else
     version_2_with_negative="True"
 fi
 
-#Edit to save logs & checkpoints in a different directory
-RESULTS_DIR=/results
-
-if [ ! -d "$SQUAD_DIR" ] ; then
-   echo "Error! $SQUAD_DIR directory missing. Please mount SQuAD dataset."
-   exit -1
-fi
-if [ ! -d "$BERT_DIR" ] ; then
-   echo "Error! $BERT_DIR directory missing. Please mount pretrained BERT dataset."
-   exit -1
-fi
-if [ ! -d "$RESULTS_DIR" ] ; then
-   echo "Error! $RESULTS_DIR directory missing."
-   exit -1
-fi
 
 echo "Squad directory set as " $SQUAD_DIR " BERT directory set as " $BERT_DIR
 echo "Results directory set as " $RESULTS_DIR
@@ -46,7 +44,6 @@ echo "Results directory set as " $RESULTS_DIR
 use_fp16=""
 if [ "$precision" = "fp16" ] ; then
         echo "fp16 activated!"
-        export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
         use_fp16="--use_fp16"
 fi
 
@@ -57,10 +54,20 @@ else
     use_xla_tag=""
 fi
 
-  printf -v TAG "tf_bert_%s_squad_inf_1n_%s_gbs%d_ckpt_%s" "$bert_model" "$precision" $batch_size "$init_checkpoint"
-  DATESTAMP=`date +'%y%m%d%H%M%S'`
-  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
-  printf "Writing logs to %s\n" "$LOGFILE"
+printf -v TAG "tf_bert_finetuning_squad_%s_inf_%s_gbs%d_ckpt_%s" "$bert_model" "$precision" $batch_size "$init_checkpoint"
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+#Edit to save logs & checkpoints in a different directory
+RESULTS_DIR=/results
+LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
+printf "Logs written to %s\n" "$LOGFILE"
+
+#Check if all necessary files are available before training
+for DIR_or_file in $SQUAD_DIR $RESULTS_DIR $BERT_DIR/vocab.txt $BERT_DIR/bert_config.json; do
+  if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then
+     echo "Error! $DIR_or_file directory missing. Please mount correctly"
+     exit -1
+  fi
+done
 
 python run_squad.py \
 --vocab_file=$BERT_DIR/vocab.txt \
@@ -68,6 +75,7 @@ python run_squad.py \
 --init_checkpoint=$init_checkpoint \
 --do_predict=True \
 --predict_file=$SQUAD_DIR/dev-v${squad_version}.json \
+--predict_batch_size=$batch_size \
 --max_seq_length=$seq_length \
 --doc_stride=$doc_stride \
 --predict_batch_size=$batch_size \

+ 17 - 3
TensorFlow/LanguageModeling/BERT/scripts/trtis/export_model.sh

@@ -1,10 +1,25 @@
-init_checkpoint=${1:-"/results/model.ckpt"}
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+init_checkpoint=${1:-"/results/models/bert_large_fp16_384_v1/model.ckpt-5474"}
 batch_size=${2:-"8"}
 precision=${3:-"fp16"}
 use_xla=${4:-"true"}
 seq_length=${5:-"384"}
 doc_stride=${6:-"128"}
-BERT_DIR=${7:-"data/pretrained_models_google/uncased_L-24_H-1024_A-16"}
+BERT_DIR=${7:-"data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16"}
 trtis_model_version=${8:-1}
 trtis_model_name=${9:-"bert"}
 trtis_dyn_batching_delay=${10:-0}
@@ -17,7 +32,6 @@ additional_args="--trtis_model_version=$trtis_model_version --trtis_model_name=$
 
 if [ "$precision" = "fp16" ] ; then
    echo "fp16 activated!"
-   export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
    additional_args="$additional_args --use_fp16"
 fi
 

+ 17 - 2
TensorFlow/LanguageModeling/BERT/scripts/trtis/generate_figures.sh

@@ -1,3 +1,18 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Set the number of devices to use
 export NVIDIA_VISIBLE_DEVICES=0
 
@@ -12,9 +27,9 @@ init_checkpoint=${4:-"/results/models/bert_tf_${bert_model}_${precision}_${seq_l
 MODEL_NAME="bert_${bert_model}_${seq_length}_${precision}"
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/pretrained_models_google/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
 else
-    export BERT_DIR=data/pretrained_models_google/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
 
 doc_stride=128

+ 17 - 2
TensorFlow/LanguageModeling/BERT/scripts/trtis/run_client.sh

@@ -1,12 +1,27 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 batch_size=${1:-"8"}
 seq_length=${2:-"384"}
 doc_stride=${3:-"128"}
 trtis_version_name=${4:-"1"}
 trtis_model_name=${5:-"bert"}
-BERT_DIR=${6:-"data/pretrained_models_google/uncased_L-24_H-1024_A-16"}
+BERT_DIR=${6:-"data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16"}
 squad_version=${7:-"1.1"}
 
-export SQUAD_DIR=data/squad/v${squad_version}
+export SQUAD_DIR=data/download/squad/v${squad_version}
 if [ "$squad_version" = "1.1" ] ; then
     version_2_with_negative="False"
 else

+ 13 - 1
TensorFlow/LanguageModeling/BERT/scripts/trtis/run_perf_client.sh

@@ -1,6 +1,18 @@
-
 #!/bin/bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 MODEL_NAME=${1:-"bert"}
 MODEL_VERSION=${2:-1}
 precision=${3:-"fp16"}

+ 18 - 3
TensorFlow/LanguageModeling/BERT/scripts/trtis/run_trtis.sh

@@ -1,4 +1,19 @@
-init_checkpoint=${1:-"/results/model.ckpt"}
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+init_checkpoint=${1:-"/results/models/bert_large_fp16_384_v1/model.ckpt-5474"}
 batch_size=${2:-"8"}
 precision=${3:-"fp16"}
 use_xla=${4:-"true"}
@@ -14,9 +29,9 @@ trtis_engine_count=${13:-1}
 trtis_model_overwrite=${14:-"False"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/pretrained_models_google/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
 else
-    export BERT_DIR=data/pretrained_models_google/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
 
 if [ ! -d "$BERT_DIR" ] ; then

+ 13 - 0
TensorFlow/LanguageModeling/BERT/scripts/trtis/wait_for_trtis_server.sh

@@ -1,5 +1,18 @@
 #!/bin/bash
 
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SERVER_URI=${1:-"localhost"}
 
 echo "Waiting for TRTIS Server to be ready at http://$SERVER_URI:8000..."

+ 286 - 234
TensorFlow/LanguageModeling/BERT/tokenization.py

@@ -1,5 +1,6 @@
 # coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Tokenization classes."""
 
 from __future__ import absolute_import
@@ -23,6 +25,18 @@ import unicodedata
 import six
 import tensorflow as tf
 import re
+import os
+
+
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+}
 
 def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
   """Checks whether the casing config is consistent with the checkpoint name."""
@@ -76,61 +90,41 @@ def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
 
 
 def convert_to_unicode(text):
-  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-  if six.PY3:
+    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
     if isinstance(text, str):
-      return text
+        return text
     elif isinstance(text, bytes):
-      return text.decode("utf-8", "ignore")
+        return text.decode("utf-8", "ignore")
     else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  elif six.PY2:
-    if isinstance(text, str):
-      return text.decode("utf-8", "ignore")
-    elif isinstance(text, unicode):
-      return text
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  else:
-    raise ValueError("Not running on Python2 or Python 3?")
+        raise ValueError("Unsupported string type: %s" % (type(text)))
 
 
 def printable_text(text):
-  """Returns text encoded in a way suitable for print or `tf.logging`."""
+    """Returns text encoded in a way suitable for print or `tf.logging`."""
 
-  # These functions want `str` for both Python2 and Python3, but in one case
-  # it's a Unicode string and in the other it's a byte string.
-  if six.PY3:
+    # These functions want `str` for both Python2 and Python3, but in one case
+    # it's a Unicode string and in the other it's a byte string.
     if isinstance(text, str):
-      return text
+        return text
     elif isinstance(text, bytes):
-      return text.decode("utf-8", "ignore")
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  elif six.PY2:
-    if isinstance(text, str):
-      return text
-    elif isinstance(text, unicode):
-      return text.encode("utf-8")
+        return text.decode("utf-8", "ignore")
     else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  else:
-    raise ValueError("Not running on Python2 or Python 3?")
+        raise ValueError("Unsupported string type: %s" % (type(text)))
 
 
 def load_vocab(vocab_file):
-  """Loads a vocabulary file into a dictionary."""
-  vocab = collections.OrderedDict()
-  index = 0
-  with tf.gfile.GFile(vocab_file, "r") as reader:
-    while True:
-      token = convert_to_unicode(reader.readline())
-      if not token:
-        break
-      token = token.strip()
-      vocab[token] = index
-      index += 1
-  return vocab
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    index = 0
+    with open(vocab_file, "r") as reader:
+        while True:
+            token = convert_to_unicode(reader.readline())
+            if not token:
+                break
+            token = token.strip()
+            vocab[token] = index
+            index += 1
+    return vocab
 
 
 def convert_by_vocab(vocab, items):
@@ -141,21 +135,13 @@ def convert_by_vocab(vocab, items):
   return output
 
 
-def convert_tokens_to_ids(vocab, tokens):
-  return convert_by_vocab(vocab, tokens)
-
-
-def convert_ids_to_tokens(inv_vocab, ids):
-  return convert_by_vocab(inv_vocab, ids)
-
-
 def whitespace_tokenize(text):
-  """Runs basic whitespace cleaning and splitting on a piece of text."""
-  text = text.strip()
-  if not text:
-    return []
-  tokens = text.split()
-  return tokens
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
 
 
 class FullTokenizer(object):
@@ -182,131 +168,197 @@ class FullTokenizer(object):
     return convert_by_vocab(self.inv_vocab, ids)
 
 
+class BertTokenizer(object):
+    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+
+    def __init__(self, vocab_file, do_lower_case=True):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+    def tokenize(self, text):
+        split_tokens = []
+        for token in self.basic_tokenizer.tokenize(text):
+            for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                split_tokens.append(sub_token)
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """Converts a sequence of tokens into ids using the vocab."""
+        ids = []
+        for token in tokens:
+            ids.append(self.vocab[token])
+        return ids
+
+    def convert_ids_to_tokens(self, ids):
+        """Converts a sequence of ids in wordpiece tokens using the vocab."""
+        tokens = []
+        for i in ids:
+            tokens.append(self.ids_to_tokens[i])
+        return tokens
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
+        """
+    Instantiate a PreTrainedBertModel from a pre-trained model file.
+    Download and cache the pre-trained model file if needed.
+    """
+        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            vocab_file = pretrained_model_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_file = cached_path(vocab_file)
+            if resolved_vocab_file == vocab_file:
+
+                logger.info("loading vocabulary file {}".format(vocab_file))
+            else:
+                logger.info("loading vocabulary file {} from cache at {}".format(
+                    vocab_file, resolved_vocab_file))
+            # Instantiate tokenizer.
+            tokenizer = cls(resolved_vocab_file, do_lower_case)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                    pretrained_model_name))
+            tokenizer = None
+        return tokenizer
+
+
 class BasicTokenizer(object):
-  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
 
-  def __init__(self, do_lower_case=True):
-    """Constructs a BasicTokenizer.
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
 
     Args:
       do_lower_case: Whether to lower case the input.
     """
-    self.do_lower_case = do_lower_case
-
-  def tokenize(self, text):
-    """Tokenizes a piece of text."""
-    text = convert_to_unicode(text)
-    text = self._clean_text(text)
-
-    # This was added on November 1st, 2018 for the multilingual and Chinese
-    # models. This is also applied to the English models now, but it doesn't
-    # matter since the English models were not trained on any Chinese data
-    # and generally don't have any Chinese data in them (there are Chinese
-    # characters in the vocabulary because Wikipedia does have some Chinese
-    # words in the English Wikipedia.).
-    text = self._tokenize_chinese_chars(text)
-
-    orig_tokens = whitespace_tokenize(text)
-    split_tokens = []
-    for token in orig_tokens:
-      if self.do_lower_case:
-        token = token.lower()
-        token = self._run_strip_accents(token)
-      split_tokens.extend(self._run_split_on_punc(token))
-
-    output_tokens = whitespace_tokenize(" ".join(split_tokens))
-    return output_tokens
-
-  def _run_strip_accents(self, text):
-    """Strips accents from a piece of text."""
-    text = unicodedata.normalize("NFD", text)
-    output = []
-    for char in text:
-      cat = unicodedata.category(char)
-      if cat == "Mn":
-        continue
-      output.append(char)
-    return "".join(output)
-
-  def _run_split_on_punc(self, text):
-    """Splits punctuation on a piece of text."""
-    chars = list(text)
-    i = 0
-    start_new_word = True
-    output = []
-    while i < len(chars):
-      char = chars[i]
-      if _is_punctuation(char):
-        output.append([char])
+        self.do_lower_case = do_lower_case
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text."""
+        text = convert_to_unicode(text)
+        text = self._clean_text(text)
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
         start_new_word = True
-      else:
-        if start_new_word:
-          output.append([])
-        start_new_word = False
-        output[-1].append(char)
-      i += 1
-
-    return ["".join(x) for x in output]
-
-  def _tokenize_chinese_chars(self, text):
-    """Adds whitespace around any CJK character."""
-    output = []
-    for char in text:
-      cp = ord(char)
-      if self._is_chinese_char(cp):
-        output.append(" ")
-        output.append(char)
-        output.append(" ")
-      else:
-        output.append(char)
-    return "".join(output)
-
-  def _is_chinese_char(self, cp):
-    """Checks whether CP is the codepoint of a CJK character."""
-    # This defines a "chinese character" as anything in the CJK Unicode block:
-    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-    #
-    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-    # despite its name. The modern Korean Hangul alphabet is a different block,
-    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-    # space-separated words, so they are not treated specially and handled
-    # like the all of the other languages.
-    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
-        (cp >= 0x3400 and cp <= 0x4DBF) or  #
-        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
-        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
-        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
-        (cp >= 0x2B820 and cp <= 0x2CEAF) or
-        (cp >= 0xF900 and cp <= 0xFAFF) or  #
-        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
-      return True
-
-    return False
-
-  def _clean_text(self, text):
-    """Performs invalid character removal and whitespace cleanup on text."""
-    output = []
-    for char in text:
-      cp = ord(char)
-      if cp == 0 or cp == 0xfffd or _is_control(char):
-        continue
-      if _is_whitespace(char):
-        output.append(" ")
-      else:
-        output.append(char)
-    return "".join(output)
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+                (cp >= 0x3400 and cp <= 0x4DBF) or  #
+                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or  #
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
 
 
 class WordpieceTokenizer(object):
-  """Runs WordPiece tokenziation."""
+    """Runs WordPiece tokenization."""
 
-  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
-    self.vocab = vocab
-    self.unk_token = unk_token
-    self.max_input_chars_per_word = max_input_chars_per_word
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
 
-  def tokenize(self, text):
-    """Tokenizes a piece of text into its word pieces.
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
 
     This uses a greedy longest-match-first algorithm to perform tokenization
     using the given vocabulary.
@@ -323,77 +375,77 @@ class WordpieceTokenizer(object):
       A list of wordpiece tokens.
     """
 
-    text = convert_to_unicode(text)
-
-    output_tokens = []
-    for token in whitespace_tokenize(text):
-      chars = list(token)
-      if len(chars) > self.max_input_chars_per_word:
-        output_tokens.append(self.unk_token)
-        continue
-
-      is_bad = False
-      start = 0
-      sub_tokens = []
-      while start < len(chars):
-        end = len(chars)
-        cur_substr = None
-        while start < end:
-          substr = "".join(chars[start:end])
-          if start > 0:
-            substr = "##" + substr
-          if substr in self.vocab:
-            cur_substr = substr
-            break
-          end -= 1
-        if cur_substr is None:
-          is_bad = True
-          break
-        sub_tokens.append(cur_substr)
-        start = end
-
-      if is_bad:
-        output_tokens.append(self.unk_token)
-      else:
-        output_tokens.extend(sub_tokens)
-    return output_tokens
+        text = convert_to_unicode(text)
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
 
 
 def _is_whitespace(char):
-  """Checks whether `chars` is a whitespace character."""
-  # \t, \n, and \r are technically contorl characters but we treat them
-  # as whitespace since they are generally considered as such.
-  if char == " " or char == "\t" or char == "\n" or char == "\r":
-    return True
-  cat = unicodedata.category(char)
-  if cat == "Zs":
-    return True
-  return False
+    """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
 
 
 def _is_control(char):
-  """Checks whether `chars` is a control character."""
-  # These are technically control characters but we count them as whitespace
-  # characters.
-  if char == "\t" or char == "\n" or char == "\r":
+    """Checks whether `chars` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
     return False
-  cat = unicodedata.category(char)
-  if cat in ("Cc", "Cf"):
-    return True
-  return False
 
 
 def _is_punctuation(char):
-  """Checks whether `chars` is a punctuation character."""
-  cp = ord(char)
-  # We treat all non-letter/number ASCII as punctuation.
-  # Characters such as "^", "$", and "`" are not in the Unicode
-  # Punctuation class but we treat them as punctuation anyways, for
-  # consistency.
-  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
-    return True
-  cat = unicodedata.category(char)
-  if cat.startswith("P"):
-    return True
-  return False
+    """Checks whether `chars` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False

+ 13 - 0
TensorFlow/LanguageModeling/BERT/utils/create_glue_data.py

@@ -1,3 +1,16 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+ 341 - 282
TensorFlow/LanguageModeling/BERT/utils/create_pretraining_data.py

@@ -1,4 +1,5 @@
 # coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,54 +13,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Create masked LM/next sentence masked_lm TF examples for BERT."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 
-import collections
+import argparse
+import logging
+import os
 import random
-import tokenization
+from io import open
+import h5py
 import tensorflow as tf
+import numpy as np
+from tqdm import tqdm, trange
 
-flags = tf.flags
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_string("input_file", None,
-                    "Input raw text file (or comma-separated list of files).")
-
-flags.DEFINE_string(
-    "output_file", None,
-    "Output TF example file (or comma-separated list of files).")
-
-flags.DEFINE_string("vocab_file", None,
-                    "The vocabulary file that the BERT model was trained on.")
-
-flags.DEFINE_bool(
-    "do_lower_case", True,
-    "Whether to lower case the input text. Should be True for uncased "
-    "models and False for cased models.")
-
-flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
-
-flags.DEFINE_integer("max_predictions_per_seq", 20,
-                     "Maximum number of masked LM predictions per sequence.")
-
-flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
-
-flags.DEFINE_integer(
-    "dupe_factor", 10,
-    "Number of times to duplicate the input data (with different masks).")
-
-flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
-
-flags.DEFINE_float(
-    "short_seq_prob", 0.1,
-    "Probability of creating sequences which are shorter than the "
-    "maximum length.")
+from tokenization import BertTokenizer
+import tokenization as tokenization
 
+import random
+import collections
 
 class TrainingInstance(object):
   """A single training instance (sentence pair)."""
@@ -90,7 +63,7 @@ class TrainingInstance(object):
 
 
 def write_instance_to_example_files(instances, tokenizer, max_seq_length,
-                                    max_predictions_per_seq, output_files):
+                                    max_predictions_per_seq, output_files, output_formats="tfrecord"):
   """Create TF example files from `TrainingInstance`s."""
   writers = []
   for output_file in output_files:
@@ -99,6 +72,16 @@ def write_instance_to_example_files(instances, tokenizer, max_seq_length,
   writer_index = 0
 
   total_written = 0
+  if 'hdf5' in output_formats:
+    features_hdf5 = collections.OrderedDict()
+    num_instances = len(instances)
+    features_hdf5["input_ids"] = np.zeros([num_instances, max_seq_length], dtype="int32")
+    features_hdf5["input_mask"] = np.zeros([num_instances, max_seq_length], dtype="int32")
+    features_hdf5["segment_ids"] = np.zeros([num_instances, max_seq_length], dtype="int32")
+    features_hdf5["masked_lm_positions"] =  np.zeros([num_instances, max_predictions_per_seq], dtype="int32")
+    features_hdf5["masked_lm_ids"] = np.zeros([num_instances, max_predictions_per_seq], dtype="int32")
+    features_hdf5["next_sentence_labels"] = np.zeros(num_instances, dtype="int32")
+
   for (inst_index, instance) in enumerate(instances):
     input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
     input_mask = [1] * len(input_ids)
@@ -134,9 +117,19 @@ def write_instance_to_example_files(instances, tokenizer, max_seq_length,
     features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
     features["next_sentence_labels"] = create_int_feature([next_sentence_label])
 
-    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    if 'tfrecord' in output_formats:
+      tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+      writers[writer_index].write(tf_example.SerializeToString())
+    if 'hdf5' in output_formats:
+      features_hdf5["input_ids"][inst_index] = input_ids
+      features_hdf5["input_mask"][inst_index] = input_mask
+      features_hdf5["segment_ids"][inst_index] = segment_ids
+      features_hdf5["masked_lm_positions"][inst_index] = masked_lm_positions
+      features_hdf5["masked_lm_ids"][inst_index] = masked_lm_ids
+      features_hdf5["next_sentence_labels"][inst_index] = next_sentence_label
+    if 'tfrecord' not in output_formats and 'hdf5' not in output_formats:
+      assert False, 'Either empty output_formats list or unsupported type specified. Try: tfrecord or hdf5'
 
-    writers[writer_index].write(tf_example.SerializeToString())
     writer_index = (writer_index + 1) % len(writers)
 
     total_written += 1
@@ -159,6 +152,17 @@ def write_instance_to_example_files(instances, tokenizer, max_seq_length,
   for writer in writers:
     writer.close()
 
+  if 'hdf5' in output_formats:
+    f = h5py.File(output_file, 'w')
+    f.create_dataset("input_ids", data=features_hdf5["input_ids"], dtype='i4', compression='gzip')
+    f.create_dataset("input_mask", data=features_hdf5["input_mask"], dtype='i1', compression='gzip')
+    f.create_dataset("segment_ids", data=features_hdf5["segment_ids"], dtype='i1', compression='gzip')
+    f.create_dataset("masked_lm_positions", data=features_hdf5["masked_lm_positions"], dtype='i4', compression='gzip')
+    f.create_dataset("masked_lm_ids", data=features_hdf5["masked_lm_ids"], dtype='i4', compression='gzip')
+    f.create_dataset("next_sentence_labels", data=features_hdf5["next_sentence_labels"], dtype='i1', compression='gzip')
+    f.flush()
+    f.close()
+
   tf.logging.info("Wrote %d total instances", total_written)
 
 
@@ -175,160 +179,161 @@ def create_float_feature(values):
 def create_training_instances(input_files, tokenizer, max_seq_length,
                               dupe_factor, short_seq_prob, masked_lm_prob,
                               max_predictions_per_seq, rng):
-  """Create `TrainingInstance`s from raw text."""
-  all_documents = [[]]
-
-  # Input file format:
-  # (1) One sentence per line. These should ideally be actual sentences, not
-  # entire paragraphs or arbitrary spans of text. (Because we use the
-  # sentence boundaries for the "next sentence prediction" task).
-  # (2) Blank lines between documents. Document boundaries are needed so
-  # that the "next sentence prediction" task doesn't span between documents.
-  for input_file in input_files:
-    with tf.gfile.GFile(input_file, "r") as reader:
-      while True:
-        line = tokenization.convert_to_unicode(reader.readline())
-        if not line:
-          break
-        line = line.strip()
-
-        # Empty lines are used as document delimiters
-        if not line:
-          all_documents.append([])
-        tokens = tokenizer.tokenize(line)
-        if tokens:
-          all_documents[-1].append(tokens)
-
-  # Remove empty documents
-  all_documents = [x for x in all_documents if x]
-  rng.shuffle(all_documents)
-
-  vocab_words = list(tokenizer.vocab.keys())
-  instances = []
-  for _ in range(dupe_factor):
-    for document_index in range(len(all_documents)):
-      instances.extend(
-          create_instances_from_document(
-              all_documents, document_index, max_seq_length, short_seq_prob,
-              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
-
-  rng.shuffle(instances)
-  return instances
+    """Create `TrainingInstance`s from raw text."""
+    all_documents = [[]]
+
+    # Input file format:
+    # (1) One sentence per line. These should ideally be actual sentences, not
+    # entire paragraphs or arbitrary spans of text. (Because we use the
+    # sentence boundaries for the "next sentence prediction" task).
+    # (2) Blank lines between documents. Document boundaries are needed so
+    # that the "next sentence prediction" task doesn't span between documents.
+    for input_file in input_files:
+        print("creating instance from {}".format(input_file))
+        with open(input_file, "r") as reader:
+            while True:
+                line = tokenization.convert_to_unicode(reader.readline())
+                if not line:
+                    break
+                line = line.strip()
+
+                # Empty lines are used as document delimiters
+                if not line:
+                    all_documents.append([])
+                tokens = tokenizer.tokenize(line)
+                if tokens:
+                    all_documents[-1].append(tokens)
+
+    # Remove empty documents
+    all_documents = [x for x in all_documents if x]
+    rng.shuffle(all_documents)
+
+    vocab_words = list(tokenizer.vocab.keys())
+    instances = []
+    for _ in range(dupe_factor):
+        for document_index in range(len(all_documents)):
+            instances.extend(
+                create_instances_from_document(
+                    all_documents, document_index, max_seq_length, short_seq_prob,
+                    masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
+
+    rng.shuffle(instances)
+    return instances
 
 
def create_instances_from_document(
        all_documents, document_index, max_seq_length, short_seq_prob,
        masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
    """Creates `TrainingInstance`s for a single document.

    Greedily accumulates sentences into chunks near a target length, then
    splits each chunk into segments A and B for next-sentence prediction;
    with probability ~0.5 (or always, for single-sentence chunks) segment B
    is instead drawn from a random other document.  NOTE: the exact order of
    `rng` calls determines the output, so the control flow must not be
    reordered if reproducibility across runs matters.

    Args:
        all_documents: all tokenized documents (list of documents, each a
            list of sentences, each a list of WordPiece tokens); needed
            beyond the current document to sample random "B" segments.
        document_index: index into `all_documents` of the document to process.
        max_seq_length: hard limit on total tokens including [CLS]/[SEP].
        short_seq_prob: probability of using a shorter target length to
            reduce the pretrain/fine-tune length mismatch.
        masked_lm_prob: fraction of tokens to mask (forwarded to
            `create_masked_lm_predictions`).
        max_predictions_per_seq: cap on masked positions per instance.
        vocab_words: vocabulary list used for random-replacement masking.
        rng: `random.Random` instance driving all sampling.

    Returns:
        A list of `TrainingInstance`s built from this document.
    """
    document = all_documents[document_index]

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # We *usually* want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pre-training and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `max_seq_length` is a hard limit.
    target_seq_length = max_num_tokens
    if rng.random() < short_seq_prob:
        target_seq_length = rng.randint(2, max_num_tokens)

    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = rng.randint(1, len(current_chunk) - 1)

                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])

                tokens_b = []
                # Random next
                is_random_next = False
                if len(current_chunk) == 1 or rng.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

                    # This should rarely go for more than one iteration for large
                    # corpora. However, just to be careful, we try to make sure that
                    # the random document is not the same as the document
                    # we're processing.
                    for _ in range(10):
                        random_document_index = rng.randint(0, len(all_documents) - 1)
                        if random_document_index != document_index:
                            break

                    random_document = all_documents[random_document_index]
                    random_start = rng.randint(0, len(random_document) - 1)
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break
                    # We didn't actually use these segments so we "put them back" so
                    # they don't go to waste.  Rewinding `i` means they will be
                    # re-consumed by the next chunk.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                # Actual next
                else:
                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])
                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                # Assemble [CLS] A [SEP] B [SEP]; segment_ids are 0 for the
                # A segment (incl. its [SEP]) and 1 for the B segment.
                tokens = []
                segment_ids = []
                tokens.append("[CLS]")
                segment_ids.append(0)
                for token in tokens_a:
                    tokens.append(token)
                    segment_ids.append(0)

                tokens.append("[SEP]")
                segment_ids.append(0)

                for token in tokens_b:
                    tokens.append(token)
                    segment_ids.append(1)
                tokens.append("[SEP]")
                segment_ids.append(1)

                (tokens, masked_lm_positions,
                 masked_lm_labels) = create_masked_lm_predictions(
                    tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
                instance = TrainingInstance(
                    tokens=tokens,
                    segment_ids=segment_ids,
                    is_random_next=is_random_next,
                    masked_lm_positions=masked_lm_positions,
                    masked_lm_labels=masked_lm_labels)
                instances.append(instance)
            current_chunk = []
            current_length = 0
        i += 1

    return instances
 
 
 MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
@@ -337,106 +342,160 @@ MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
 
 def create_masked_lm_predictions(tokens, masked_lm_prob,
                                  max_predictions_per_seq, vocab_words, rng):
-  """Creates the predictions for the masked LM objective."""
-
-  cand_indexes = []
-  for (i, token) in enumerate(tokens):
-    if token == "[CLS]" or token == "[SEP]":
-      continue
-    cand_indexes.append(i)
-
-  rng.shuffle(cand_indexes)
-
-  output_tokens = list(tokens)
-
-  num_to_predict = min(max_predictions_per_seq,
-                       max(1, int(round(len(tokens) * masked_lm_prob))))
-
-  masked_lms = []
-  covered_indexes = set()
-  for index in cand_indexes:
-    if len(masked_lms) >= num_to_predict:
-      break
-    if index in covered_indexes:
-      continue
-    covered_indexes.add(index)
-
-    masked_token = None
-    # 80% of the time, replace with [MASK]
-    if rng.random() < 0.8:
-      masked_token = "[MASK]"
-    else:
-      # 10% of the time, keep original
-      if rng.random() < 0.5:
-        masked_token = tokens[index]
-      # 10% of the time, replace with random word
-      else:
-        masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
+    """Creates the predictions for the masked LM objective."""
+
+    cand_indexes = []
+    for (i, token) in enumerate(tokens):
+        if token == "[CLS]" or token == "[SEP]":
+            continue
+        cand_indexes.append(i)
+
+    rng.shuffle(cand_indexes)
+
+    output_tokens = list(tokens)
+
+    num_to_predict = min(max_predictions_per_seq,
+                         max(1, int(round(len(tokens) * masked_lm_prob))))
+
+    masked_lms = []
+    covered_indexes = set()
+    for index in cand_indexes:
+        if len(masked_lms) >= num_to_predict:
+            break
+        if index in covered_indexes:
+            continue
+        covered_indexes.add(index)
+
+        masked_token = None
+        # 80% of the time, replace with [MASK]
+        if rng.random() < 0.8:
+            masked_token = "[MASK]"
+        else:
+            # 10% of the time, keep original
+            if rng.random() < 0.5:
+                masked_token = tokens[index]
+            # 10% of the time, replace with random word
+            else:
+                masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
 
-    output_tokens[index] = masked_token
+        output_tokens[index] = masked_token
 
-    masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
+        masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
 
-  masked_lms = sorted(masked_lms, key=lambda x: x.index)
+    masked_lms = sorted(masked_lms, key=lambda x: x.index)
 
-  masked_lm_positions = []
-  masked_lm_labels = []
-  for p in masked_lms:
-    masked_lm_positions.append(p.index)
-    masked_lm_labels.append(p.label)
+    masked_lm_positions = []
+    masked_lm_labels = []
+    for p in masked_lms:
+        masked_lm_positions.append(p.index)
+        masked_lm_labels.append(p.label)
 
-  return (output_tokens, masked_lm_positions, masked_lm_labels)
+    return (output_tokens, masked_lm_positions, masked_lm_labels)
 
 
 def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
-  """Truncates a pair of sequences to a maximum sequence length."""
-  while True:
-    total_length = len(tokens_a) + len(tokens_b)
-    if total_length <= max_num_tokens:
-      break
-
-    trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
-    assert len(trunc_tokens) >= 1
-
-    # We want to sometimes truncate from the front and sometimes from the
-    # back to add more randomness and avoid biases.
-    if rng.random() < 0.5:
-      del trunc_tokens[0]
+    """Truncates a pair of sequences to a maximum sequence length."""
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_num_tokens:
+            break
+
+        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+        assert len(trunc_tokens) >= 1
+
+        # We want to sometimes truncate from the front and sometimes from the
+        # back to add more randomness and avoid biases.
+        if rng.random() < 0.5:
+            del trunc_tokens[0]
+        else:
+            trunc_tokens.pop()
+
+
def main():
    """CLI entry point: tokenize a raw text corpus and write BERT pre-training examples."""
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary the BERT model will train on.")
    parser.add_argument("--input_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus. can be directory with .txt files or a path to a single file")
    parser.add_argument("--output_file",
                        default=None,
                        type=str,
                        required=True,
                        help="Comma-separated output file(s) the created training examples will be written to.")

    ## Other parameters
    # int
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--dupe_factor",
                        default=10,
                        type=int,
                        help="Number of times to duplicate the input data (with different masks).")
    parser.add_argument("--max_predictions_per_seq",
                        default=20,
                        type=int,
                        help="Maximum number of masked LM predictions per sequence.")

    # floats
    parser.add_argument("--masked_lm_prob",
                        default=0.15,
                        type=float,
                        help="Masked LM probability.")
    parser.add_argument("--short_seq_prob",
                        default=0.1,
                        type=float,
                        help="Probability to create a sequence shorter than maximum sequence length")

    # BUG FIX: `action='store_true'` combined with `default=True` made the
    # flag impossible to disable, silently lower-casing cased models.  Keep
    # --do_lower_case for backward compatibility and add an explicit off
    # switch for cased vocabularies.
    parser.add_argument("--do_lower_case",
                        dest="do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--no_lower_case",
                        dest="do_lower_case",
                        action='store_false',
                        help="Keep the input text's original case (use for cased models).")
    parser.set_defaults(do_lower_case=True)
    parser.add_argument('--random_seed',
                        type=int,
                        default=12345,
                        help="random seed for initialization")

    args = parser.parse_args()

    tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case)

    # Accept either a single text file or a directory of .txt shards.
    input_files = []
    if os.path.isfile(args.input_file):
        input_files.append(args.input_file)
    elif os.path.isdir(args.input_file):
        input_files = [os.path.join(args.input_file, f) for f in os.listdir(args.input_file) if
                       (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith('.txt'))]
    else:
        raise ValueError("{} is not a valid path".format(args.input_file))

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.max_seq_length, args.dupe_factor,
        args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
        rng)

    output_files = args.output_file.split(",")
    print("*** Writing to output files ***")
    for output_file in output_files:
        print(output_file)

    write_instance_to_example_files(instances, tokenizer, args.max_seq_length,
                                    args.max_predictions_per_seq, output_files)
 
 
# Script entry point: all argument parsing and work happens inside main().
if __name__ == "__main__":
    main()

+ 13 - 0
TensorFlow/LanguageModeling/BERT/utils/create_squad_data.py

@@ -1,3 +1,16 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+ 13 - 0
TensorFlow/LanguageModeling/BERT/utils/utils.py

@@ -1,3 +1,16 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import tensorflow as tf
 import time
 

Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff