Jelajahi Sumber

[BERT/TF] TRT int8 and Triton

kkudrynski 5 tahun lalu
induk
melakukan
36a6985ebc
61 mengubah file dengan 510 tambahan dan 3079 penghapusan
  1. 6 0
      TensorFlow/LanguageModeling/BERT/.gitignore
  2. 1 1
      TensorFlow/LanguageModeling/BERT/Dockerfile
  3. 1 1
      TensorFlow/LanguageModeling/BERT/README.md
  4. 1 1
      TensorFlow/LanguageModeling/BERT/biobert/conlleval.py
  5. 1 2
      TensorFlow/LanguageModeling/BERT/biobert/scripts/ner_bc5cdr-chem.sh
  6. 1 1
      TensorFlow/LanguageModeling/BERT/biobert/scripts/ner_bc5cdr-disease.sh
  7. 1 1
      TensorFlow/LanguageModeling/BERT/biobert/scripts/rel_chemprot.sh
  8. 1 1
      TensorFlow/LanguageModeling/BERT/biobert/scripts/run_biobert_finetuning_inference.sh
  9. 21 1
      TensorFlow/LanguageModeling/BERT/data/create_datasets_from_start.sh
  10. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/bert_triton_dynamic_batching_a100.png
  11. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/bert_trt_throughput_vs_latency.png
  12. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/triton_base_summary.png
  13. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/triton_bs_1.png
  14. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/triton_bs_8.png
  15. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/triton_dynamic.png
  16. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/triton_ec_1.png
  17. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/triton_ec_4.png
  18. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/triton_large_summary.png
  19. TEMPAT SAMPAH
      TensorFlow/LanguageModeling/BERT/data/images/triton_static.png
  20. 8 10
      TensorFlow/LanguageModeling/BERT/run_classifier.py
  21. 18 21
      TensorFlow/LanguageModeling/BERT/run_ner.py
  22. 2 2
      TensorFlow/LanguageModeling/BERT/run_pretraining.py
  23. 5 8
      TensorFlow/LanguageModeling/BERT/run_re.py
  24. 26 17
      TensorFlow/LanguageModeling/BERT/run_squad.py
  25. 1 1
      TensorFlow/LanguageModeling/BERT/scripts/docker/build.sh
  26. 3 3
      TensorFlow/LanguageModeling/BERT/scripts/finetune_inference_benchmark.sh
  27. 2 2
      TensorFlow/LanguageModeling/BERT/scripts/finetune_train_benchmark.sh
  28. 2 2
      TensorFlow/LanguageModeling/BERT/scripts/run_glue.sh
  29. 3 3
      TensorFlow/LanguageModeling/BERT/scripts/run_glue_inference.sh
  30. 2 2
      TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_adam.sh
  31. 2 2
      TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase1.sh
  32. 2 2
      TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase2.sh
  33. 2 2
      TensorFlow/LanguageModeling/BERT/scripts/run_squad.sh
  34. 3 4
      TensorFlow/LanguageModeling/BERT/scripts/run_squad_inference.sh
  35. 183 58
      TensorFlow/LanguageModeling/BERT/triton/README.md
  36. 46 25
      TensorFlow/LanguageModeling/BERT/triton/run_squad_triton_client.py
  37. 2 3
      TensorFlow/LanguageModeling/BERT/triton/scripts/export_model.sh
  38. 0 115
      TensorFlow/LanguageModeling/BERT/triton/scripts/generate_figures.sh
  39. 4 3
      TensorFlow/LanguageModeling/BERT/triton/scripts/launch_server.sh
  40. 1 2
      TensorFlow/LanguageModeling/BERT/triton/scripts/run_client.sh
  41. 7 8
      TensorFlow/LanguageModeling/BERT/triton/scripts/run_perf_client.sh
  42. 7 15
      TensorFlow/LanguageModeling/BERT/triton/scripts/run_triton_tf.sh
  43. 82 0
      TensorFlow/LanguageModeling/BERT/triton/scripts/run_triton_trt.sh
  44. 47 0
      TensorFlow/LanguageModeling/BERT/triton/scripts/triton_data_download.sh
  45. 2 1
      TensorFlow/LanguageModeling/BERT/triton/scripts/wait_for_triton_server.sh
  46. 0 2
      TensorFlow/LanguageModeling/BERT/trt/.dockerignore
  47. 0 67
      TensorFlow/LanguageModeling/BERT/trt/Dockerfile
  48. 0 353
      TensorFlow/LanguageModeling/BERT/trt/README.md
  49. 0 535
      TensorFlow/LanguageModeling/BERT/trt/builder.py
  50. 0 0
      TensorFlow/LanguageModeling/BERT/trt/helpers/__init__.py
  51. 0 493
      TensorFlow/LanguageModeling/BERT/trt/helpers/data_processing.py
  52. 0 429
      TensorFlow/LanguageModeling/BERT/trt/helpers/tokenization.py
  53. 0 357
      TensorFlow/LanguageModeling/BERT/trt/inference.ipynb
  54. 0 222
      TensorFlow/LanguageModeling/BERT/trt/inference.py
  55. 0 117
      TensorFlow/LanguageModeling/BERT/trt/perf.py
  56. 0 4
      TensorFlow/LanguageModeling/BERT/trt/requirements.txt
  57. 0 22
      TensorFlow/LanguageModeling/BERT/trt/scripts/build.sh
  58. 0 61
      TensorFlow/LanguageModeling/BERT/trt/scripts/download_model.sh
  59. 0 66
      TensorFlow/LanguageModeling/BERT/trt/scripts/inference_benchmark.sh
  60. 0 31
      TensorFlow/LanguageModeling/BERT/trt/scripts/launch.sh
  61. 14 0
      TensorFlow/LanguageModeling/BERT/utils/utils.py

+ 6 - 0
TensorFlow/LanguageModeling/BERT/.gitignore

@@ -4,6 +4,12 @@
 __pycache__/
 *.py[cod]
 *$py.class
+*.png
+.idea/
+*swp
+data/
+checkpoints/
+data_dl/
 
 # C extensions
 *.so

+ 1 - 1
TensorFlow/LanguageModeling/BERT/Dockerfile

@@ -17,7 +17,7 @@ RUN git clone https://github.com/titipata/pubmed_parser
 RUN pip3 install /workspace/pubmed_parser
 
 #Copy the perf_client over
-ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v2.0.0/v2.0.0_ubuntu1804.clients.tar.gz
+ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v2.2.0/v2.2.0_ubuntu1804.clients.tar.gz
 RUN mkdir -p /workspace/install \
     && curl -L ${TRTIS_CLIENTS_URL} | tar xvz -C /workspace/install
 

+ 1 - 1
TensorFlow/LanguageModeling/BERT/README.md

@@ -273,7 +273,7 @@ Note: Not using BookCorpus can potentially change final accuracy on a few downst
 
 4. Download the pretrained models from NGC.
 
-We have uploaded checkpoints that have been [fine tuned](https://ngc.nvidia.com/catalog/models/nvidia:bert_tf_v1_1_large_fp32_384) and [pre-trained](https://ngc.nvidia.com/catalog/models/nvidia:bert_tf_pretraining_lamb_16n) for various configurations on the NGC Model Registry. You can browse and download the relevant checkpoints directly from the [NGC model catalog](https://ngc.nvidia.com/catalog/models). Download them to the `results/models/` to easily access them in your scripts. 
+We have uploaded checkpoints that have been [fine tuned](https://ngc.nvidia.com/catalog/models/nvidia:bert_tf_v1_1_large_fp16_384) and [pre-trained](https://ngc.nvidia.com/catalog/models/nvidia:bert_tf_pretraining_lamb_16n) for various configurations on the NGC Model Registry. Our data download scripts, by default download some of them but you can browse and download the relevant checkpoints directly from the [NGC model catalog](https://ngc.nvidia.com/catalog/models). Download them to the `data/download/nvidia_pretrained/` to easily access them in your scripts. 
 
 5. Start an interactive session in the NGC container to run training/inference.
 

+ 1 - 1
TensorFlow/LanguageModeling/BERT/biobert/conlleval.py

@@ -22,7 +22,7 @@ ANY_SPACE = '<SPACE>'
 class FormatError(Exception):
     pass
 
-Metrics = namedtuple('Metrics', 'tp fp fn precision recall f1')
+Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
 
 
 class EvalCounts(object):

+ 1 - 2
TensorFlow/LanguageModeling/BERT/biobert/scripts/ner_bc5cdr-chem.sh

@@ -2,7 +2,7 @@
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
-init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
+init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt"}
 train_batch_size=${2:-8}
 learning_rate=${3:-3.125e-6}
 cased=${4:-false}
@@ -30,7 +30,6 @@ else
     export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
 fi
 
-
 export GBS=$(expr $train_batch_size \* $num_gpu)
 printf -v TAG "tf_bert_biobert_ner_bc5cdr_chem_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
 DATESTAMP=`date +'%y%m%d%H%M%S'`

+ 1 - 1
TensorFlow/LanguageModeling/BERT/biobert/scripts/ner_bc5cdr-disease.sh

@@ -2,7 +2,7 @@
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
-init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
+init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt"}
 train_batch_size=${2:-8}
 learning_rate=${3:-3.125e-6}
 cased=${4:-false}

+ 1 - 1
TensorFlow/LanguageModeling/BERT/biobert/scripts/rel_chemprot.sh

@@ -2,7 +2,7 @@
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
-init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
+init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt"}
 train_batch_size=${2:-8}
 learning_rate=${3:-1.5e-6}
 cased=${4:-false}

+ 1 - 1
TensorFlow/LanguageModeling/BERT/biobert/scripts/run_biobert_finetuning_inference.sh

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 task=${1:-"ner_bc5cdr-chem"}
-init_checkpoint=${2:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
+init_checkpoint=${2:-"/results/biobert_tf_uncased_base/model.ckpt"}
 bert_model=${3:-"base"}
 cased=${4:-"false"}
 precision=${5:-"fp16"}

+ 21 - 1
TensorFlow/LanguageModeling/BERT/data/create_datasets_from_start.sh

@@ -23,10 +23,30 @@ if [ "$to_download" = "wiki_books" ] ; then
 fi
 
 python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
-python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights  # Includes vocab
 python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
 python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
 python3 /workspace/bert/data/bertPrep.py --action download --dataset sst-2
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset google_pretrained_weights
+
+mkdir -p /workspace/bert/data/download/nvidia_pretrained
+#SQuAD Large Checkpoint
+	echo "Downloading SQuAD Large Checkpoint"
+	cd /workspace/bert/data/download/nvidia_pretrained && \
+		wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_ckpt_large_qa_squad11_amp_384/versions/19.03.1/zip -O bert_tf_ckpt_large_qa_squad11_amp_384_19.03.1.zip \
+		 && unzip bert_tf_ckpt_large_qa_squad11_amp_384_19.03.1.zip -d bert_tf_squad11_large_384 && rm bert_tf_ckpt_large_qa_squad11_amp_384_19.03.1.zip
+
+#SQuAD Base Checkpoint
+cd /workspace/bert/data/download/nvidia_pretrained && \
+	wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_ckpt_base_qa_squad11_amp_128/versions/19.03.1/zip -O bert_tf_ckpt_base_qa_squad11_amp_128_19.03.1.zip \
+	 && unzip bert_tf_ckpt_base_qa_squad11_amp_128_19.03.1.zip -d bert_tf_squad11_base_128 && rm bert_tf_ckpt_base_qa_squad11_amp_128_19.03.1.zip
+
+#Pretraining Large checkpoint
+cd /workspace/bert/data/download/nvidia_pretrained && \
+	wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_ckpt_large_pretraining_amp_lamb/versions/19.03.1/zip -O bert_tf_ckpt_large_pretraining_amp_lamb_19.03.1.zip \
+	&& unzip bert_tf_ckpt_large_pretraining_amp_lamb_19.03.1.zip -d bert_tf_pretraining_large_lamb && rm bert_tf_ckpt_large_pretraining_amp_lamb_19.03.1.zip
+
+python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights  # Redundant, to verify and remove
+
 
 DATASET="wikicorpus_en"
 # Properly format the text files

TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/bert_triton_dynamic_batching_a100.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/bert_trt_throughput_vs_latency.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/triton_base_summary.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/triton_bs_1.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/triton_bs_8.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/triton_dynamic.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/triton_ec_1.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/triton_ec_4.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/triton_large_summary.png


TEMPAT SAMPAH
TensorFlow/LanguageModeling/BERT/data/images/triton_static.png


+ 8 - 10
TensorFlow/LanguageModeling/BERT/run_classifier.py

@@ -29,7 +29,7 @@ import tokenization
 import tensorflow as tf
 import horovod.tensorflow as hvd
 import time
-from utils.utils import LogEvalRunHook, LogTrainRunHook
+from utils.utils import LogEvalRunHook, LogTrainRunHook, setup_xla_flags
 import utils.dllogger_class
 from dllogger import Verbosity
 from utils.create_glue_data import *
@@ -455,11 +455,8 @@ def input_fn_builder(features, batch_size, seq_length, is_training, drop_remaind
 
 
 def main(_):
-  # causes memory fragmentation for bert leading to OOM
-  if os.environ.get("TF_XLA_FLAGS", None) is not None:
-    os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation false"
-  else:
-    os.environ["TF_XLA_FLAGS"] = " --tf_xla_enable_lazy_compilation false"
+
+  setup_xla_flags()
 
   tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
   dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)
@@ -518,7 +515,8 @@ def main(_):
           training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
   if FLAGS.use_xla:
     config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
-    tf.enable_resource_variables()
+    if FLAGS.amp:
+      tf.enable_resource_variables()
 
   run_config = tf.estimator.RunConfig(
       model_dir=FLAGS.output_dir if master_process else None,
@@ -537,7 +535,7 @@ def main(_):
   train_examples = None
   num_train_steps = None
   num_warmup_steps = None
-  training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank, FLAGS.save_checkpoints_steps, num_steps_ignore_xla=10))
+  training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank, FLAGS.save_checkpoints_steps, num_steps_ignore_xla=25))
 
   if FLAGS.do_train:
     train_examples = processor.get_train_examples(FLAGS.data_dir)
@@ -597,14 +595,14 @@ def main(_):
     train_time_elapsed = time.time() - train_start_time
     train_time_wo_overhead = training_hooks[-1].total_time
     avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
-    ss_sentences_per_second = (num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead
+    ss_sentences_per_second = (training_hooks[-1].count - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead
 
     if master_process:
         tf.compat.v1.logging.info("-----------------------------")
         tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
                         num_train_steps * global_batch_size)
         tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
-                        (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
+                        (training_hooks[-1].count - training_hooks[-1].skipped) * global_batch_size)
         tf.compat.v1.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
         tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
         tf.compat.v1.logging.info("-----------------------------")

+ 18 - 21
TensorFlow/LanguageModeling/BERT/run_ner.py

@@ -25,7 +25,7 @@ import tf_metrics
 
 import time
 import horovod.tensorflow as hvd
-from utils.utils import LogEvalRunHook, LogTrainRunHook
+from utils.utils import LogEvalRunHook, LogTrainRunHook, setup_xla_flags
 import utils.dllogger_class
 from dllogger import Verbosity
 
@@ -272,7 +272,7 @@ class CLEFEProcessor(DataProcessor):
 
     @classmethod
     def _read_data2(cls, input_file):
-        with tf.io.gfile.Open(input_file, "r") as f:
+        with tf.io.gfile.GFile(input_file, "r") as f:
             lines = []
             words = []
             labels = []
@@ -306,10 +306,10 @@ class I2b22012Processor(CLEFEProcessor):
 def write_tokens(tokens, labels, mode):
     if mode == "test":
         path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt")
-        if tf.io.gfile.Exists(path):
-            wf = tf.io.gfile.Open(path, 'a')
+        if tf.io.gfile.exists(path):
+            wf = tf.io.gfile.GFile(path, 'a')
         else:
-            wf = tf.io.gfile.Open(path, 'w')
+            wf = tf.io.gfile.GFile(path, 'w')
         for token, label in zip(tokens, labels):
             if token != "**NULL**":
                 wf.write(token + ' ' + str(label) + '\n')
@@ -626,11 +626,7 @@ def result_to_pair(predict_line, pred_ids, id2label, writer, err_writer):
 
 
 def main(_):
-    # causes memory fragmentation for bert leading to OOM
-    if os.environ.get("TF_XLA_FLAGS", None) is not None:
-        os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation false"
-    else:
-        os.environ["TF_XLA_FLAGS"] = " --tf_xla_enable_lazy_compilation false"
+    setup_xla_flags()
 
     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
     dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)
@@ -685,7 +681,8 @@ def main(_):
 
     if FLAGS.use_xla:
         config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
-        tf.enable_resource_variables()
+        if FLAGS.amp:
+            tf.enable_resource_variables()
     run_config = tf.estimator.RunConfig(
       model_dir=FLAGS.output_dir if master_process else None,
       session_config=config,
@@ -795,11 +792,11 @@ def main(_):
             drop_remainder=eval_drop_remainder)
         result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
         output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
-        with tf.io.gfile.Open(output_eval_file, "w") as writer:
+        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
             tf.compat.v1.logging.info("***** Eval results *****")
             for key in sorted(result.keys()):
                 tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
-                dllogging.logger.log(step=(), data={key: float(strresult[key])}, verbosity=Verbosity.DEFAULT)
+                dllogging.logger.log(step=(), data={key: float(str(result[key]))}, verbosity=Verbosity.DEFAULT)
                 writer.write("%s = %s\n" % (key, str(result[key])))
     if FLAGS.do_predict and master_process:
         predict_examples = processor.get_test_examples(FLAGS.data_dir)
@@ -808,12 +805,12 @@ def main(_):
                                                  FLAGS.max_seq_length, tokenizer,
                                                  predict_file, mode="test")
 
-        with tf.io.gfile.Open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
+        with tf.io.gfile.GFile(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
             label2id = pickle.load(rf)
             id2label = {value: key for key, value in label2id.items()}
         token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
-        if tf.io.gfile.Exists(token_path):
-            tf.io.gfile.Remove(token_path)
+        if tf.io.gfile.exists(token_path):
+            tf.io.gfile.remove(token_path)
 
         tf.compat.v1.logging.info("***** Running prediction*****")
         tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
@@ -833,9 +830,9 @@ def main(_):
         output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
         test_labels_file = os.path.join(FLAGS.output_dir, "test_labels.txt")
         test_labels_err_file = os.path.join(FLAGS.output_dir, "test_labels_errs.txt")
-        with tf.io.gfile.Open(output_predict_file, 'w') as writer, \
-                tf.io.gfile.Open(test_labels_file, 'w') as tl, \
-                tf.io.gfile.Open(test_labels_err_file, 'w') as tle:
+        with tf.io.gfile.GFile(output_predict_file, 'w') as writer, \
+                tf.io.gfile.GFile(test_labels_file, 'w') as tl, \
+                tf.io.gfile.GFile(test_labels_err_file, 'w') as tle:
             print(id2label)
             i=0
             for prediction in estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks,
@@ -881,11 +878,11 @@ def main(_):
         tf.compat.v1.logging.info("-----------------------------")
 
         tf.compat.v1.logging.info('Reading: %s', test_labels_file)
-        with tf.io.gfile.Open(test_labels_file, "r") as f:
+        with tf.io.gfile.GFile(test_labels_file, "r") as f:
             counts = evaluate(f)
         eval_result = report_notprint(counts)
         print(''.join(eval_result))
-        with tf.io.gfile.Open(os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'), 'w') as fd:
+        with tf.io.gfile.GFile(os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'), 'w') as fd:
             fd.write(''.join(eval_result))
 
 

+ 2 - 2
TensorFlow/LanguageModeling/BERT/run_pretraining.py

@@ -26,7 +26,7 @@ import modeling
 import optimization
 import tensorflow as tf
 import glob
-from utils.utils import LogEvalRunHook
+from utils.utils import LogEvalRunHook, setup_xla_flags
 import utils.dllogger_class
 from dllogger import Verbosity
 
@@ -544,7 +544,7 @@ def _decode_record(record, name_to_features):
 
 
 def main(_):
-  os.environ["TF_XLA_FLAGS"] = " --tf_xla_enable_lazy_compilation false" #causes memory fragmentation for bert leading to OOM
+  setup_xla_flags()
 
   tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
   dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

+ 5 - 8
TensorFlow/LanguageModeling/BERT/run_re.py

@@ -35,7 +35,7 @@ import tokenization
 
 import time
 import horovod.tensorflow as hvd
-from utils.utils import LogEvalRunHook, LogTrainRunHook
+from utils.utils import LogEvalRunHook, LogTrainRunHook, setup_xla_flags
 import utils.dllogger_class
 from dllogger import Verbosity
 
@@ -191,7 +191,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with tf.io.gfile.Open(input_file, "r") as f:
+        with tf.io.gfile.GFile(input_file, "r") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
@@ -732,11 +732,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
 
 
 def main(_):
-    # causes memory fragmentation for bert leading to OOM
-    if os.environ.get("TF_XLA_FLAGS", None) is not None:
-        os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation false"
-    else:
-        os.environ["TF_XLA_FLAGS"] = " --tf_xla_enable_lazy_compilation false"
+    setup_xla_flags()
 
     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
     dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)
@@ -796,7 +792,8 @@ def main(_):
 
     if FLAGS.use_xla:
         config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
-        tf.enable_resource_variables()
+        if FLAGS.amp:
+            tf.enable_resource_variables()
     run_config = tf.estimator.RunConfig(
       model_dir=FLAGS.output_dir if master_process else None,
       session_config=config,

+ 26 - 17
TensorFlow/LanguageModeling/BERT/run_squad.py

@@ -36,7 +36,7 @@ import modeling
 import optimization
 import tokenization
 from utils.create_squad_data import *
-from utils.utils import LogEvalRunHook, LogTrainRunHook
+from utils.utils import LogEvalRunHook, LogTrainRunHook, setup_xla_flags
 import utils.dllogger_class
 from dllogger import Verbosity
 
@@ -375,7 +375,7 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
             optimization.LAMBOptimizer(learning_rate=0.0), loss_scaler)
 
       predictions = {
-          "unique_ids": unique_ids,
+          "unique_ids": tf.identity(unique_ids),
           "start_logits": start_logits,
           "end_logits": end_logits,
       }
@@ -564,7 +564,6 @@ def get_predictions(all_examples, all_features, all_results, n_best_size, max_an
       else:
         final_text = ""
         seen_predictions[final_text] = True
-
       nbest.append(
           _NbestPrediction(
               text=final_text,
@@ -614,7 +613,12 @@ def get_predictions(all_examples, all_features, all_results, n_best_size, max_an
       score_diff = score_null - best_non_null_entry.start_logit - (
           best_non_null_entry.end_logit)
       scores_diff_json[example.qas_id] = score_diff
-      if score_diff > FLAGS.null_score_diff_threshold:
+
+      try:
+        null_score_diff_threshold = FLAGS.null_score_diff_threshold
+      except:
+        null_score_diff_threshold = 0.0
+      if score_diff > null_score_diff_threshold:
         all_predictions[example.qas_id] = ""
       else:
         all_predictions[example.qas_id] = best_non_null_entry.text
@@ -853,6 +857,19 @@ def export_model(estimator, export_dir, init_checkpoint):
     # Now build the config for Triton. Check to make sure we can overwrite it, if it exists
     config_filename = os.path.join(model_folder, "config.pbtxt")
 
+    optimization_str = ""
+    if FLAGS.amp:
+      optimization_str = r"""
+optimization {
+  execution_accelerators
+  {
+    gpu_execution_accelerator :
+    [ {
+      name : "auto_mixed_precision"
+    } ]
+  }
+}"""
+
     if (os.path.exists(config_filename) and not FLAGS.triton_model_overwrite):
         print("ERROR: Could not save Triton model config. Config file already exists. Use '--triton_model_overwrite=True' if you would like to overwrite an existing model config. Model config: {}".format(config_filename))
         return
@@ -861,6 +878,7 @@ def export_model(estimator, export_dir, init_checkpoint):
 name: "{model_name}"
 platform: "tensorflow_savedmodel"
 max_batch_size: {max_batch_size}
+{optimization_str}
 input [
     {{
         name: "unique_ids"
@@ -900,8 +918,6 @@ input [
 instance_group [
     {{
         count: {engine_count}
-        kind: KIND_GPU
-        gpus: [{gpu_list}]
     }}
 ]"""
 
@@ -924,8 +940,8 @@ dynamic_batching {{
         "max_batch_size": max_batch_size,
         "seq_length": FLAGS.max_seq_length,
         "dynamic_batching": batching_str,
-        "gpu_list": ", ".join([x.name.split(":")[-1] for x in device_lib.list_local_devices() if x.device_type == "GPU"]),
-        "engine_count": FLAGS.triton_engine_count
+        "engine_count": FLAGS.triton_engine_count,
+        "optimization_str":optimization_str,
     }
 
     with open(model_folder + "/config.pbtxt", "w") as file:
@@ -934,14 +950,7 @@ dynamic_batching {{
         file.write(final_config_str)
 
 def main(_):
-  # causes memory fragmentation for bert leading to OOM
-  if os.environ.get("TF_XLA_FLAGS", None) is not None:
-    os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation false"
-  else:
-    os.environ["TF_XLA_FLAGS"] = " --tf_xla_enable_lazy_compilation false"
-
-  # Enable async_io to speed up multi-gpu training with XLA and Horovod.
-  os.environ["TF_XLA_FLAGS"] += " --tf_xla_async_io_level 1"
+  setup_xla_flags()
 
   tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
   dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)
@@ -1216,4 +1225,4 @@ def main(_):
 
 if __name__ == "__main__":
   FLAGS = extract_run_squad_flags()
-  tf.app.run()
+  tf.app.run()

+ 1 - 1
TensorFlow/LanguageModeling/BERT/scripts/docker/build.sh

@@ -1,5 +1,5 @@
 #!/bin/bash
 
-docker pull nvcr.io/nvidia/tritonserver:20.06-v1-py3
+docker pull nvcr.io/nvidia/tritonserver:20.09-py3
 
 docker build . --rm -t bert

+ 3 - 3
TensorFlow/LanguageModeling/BERT/scripts/finetune_inference_benchmark.sh

@@ -36,13 +36,13 @@ if [ "$task" = "squad" ] ; then
 
 
       if [ "$bert_model" = "large" ] ; then
-        export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+          export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
       else
-          export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
+          export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
       fi
       echo  "BERT directory set as " $BERT_DIR
 
-      init_checkpoint="$BERT_DIR/bert_model.ckpt"
+      init_checkpoint="$BERT_DIR/model.ckpt"
 
       for seq_len in 128 384; do
 

+ 2 - 2
TensorFlow/LanguageModeling/BERT/scripts/finetune_train_benchmark.sh

@@ -19,9 +19,9 @@ num_gpu=${3:-"8"}
 task=${4:-"squad"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
 else
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
 fi
 
 echo  "BERT directory set as " $BERT_DIR

+ 2 - 2
TensorFlow/LanguageModeling/BERT/scripts/run_glue.sh

@@ -26,7 +26,7 @@ doc_stride=${8:-"64"}
 bert_model=${9:-"large"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
 else
     export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
@@ -35,7 +35,7 @@ export GLUE_DIR=data/download
 
 epochs=${10:-"3.0"}
 ws=${11:-"0.1"}
-init_checkpoint=${12:-"$BERT_DIR/bert_model.ckpt"}
+init_checkpoint=${12:-"$BERT_DIR/model.ckpt"}
 
 echo "GLUE directory set as " $GLUE_DIR " BERT directory set as " $BERT_DIR
 

+ 3 - 3
TensorFlow/LanguageModeling/BERT/scripts/run_glue_inference.sh

@@ -15,7 +15,7 @@
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 task_name=${1:-"MRPC"}
-init_checkpoint=${2:-"$BERT_DIR/bert_model.ckpt"}
+init_checkpoint=${2:-"$BERT_DIR/model.ckpt"}
 batch_size=${3:-"32"}
 precision=${4:-"fp16"}
 use_xla=${5:-"true"}
@@ -24,9 +24,9 @@ doc_stride=${7:-"64"}
 bert_model=${8:-"large"}
 
 if [ "$bert_model" = "large" ] ; then
-    BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
 else
-    BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
 fi
 GLUE_DIR=data/download
 

+ 2 - 2
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_adam.sh

@@ -32,9 +32,9 @@ max_pred_per_seq=${13:-80}
 DATA_DIR=data/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
 else
-    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12/bert_config.json
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
 fi
 
 PREC=""

+ 2 - 2
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase1.sh

@@ -36,9 +36,9 @@ DATA_DIR=${DATA_DIR:-data}
 RESULTS_DIR=${RESULTS_DIR:-/results}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json
+    export BERT_CONFIG=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb/bert_config.json
 else
-    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12/bert_config.json
+    export BERT_CONFIG=data/download/nvidia_pretrained/bert_tf_squad11_base_128/bert_config.json
 fi
 
 PREC=""

+ 2 - 2
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_lamb_phase2.sh

@@ -36,9 +36,9 @@ DATA_DIR=${DATA_DIR:-data}
 RESULTS_DIR=${RESULTS_DIR:-/results}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json
+    export BERT_CONFIG=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb/bert_config.json
 else
-    export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12/bert_config.json
+    export BERT_CONFIG=data/download/nvidia_pretrained/bert_tf_squad11_base_128/bert_config.json
 fi
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID

+ 2 - 2
TensorFlow/LanguageModeling/BERT/scripts/run_squad.sh

@@ -25,7 +25,7 @@ doc_stride=${7:-"128"}
 bert_model=${8:-"large"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
 else
     export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
 fi
@@ -39,7 +39,7 @@ else
     version_2_with_negative="True"
 fi
 
-init_checkpoint=${10:-"$BERT_DIR/bert_model.ckpt"}
+init_checkpoint=${10:-"$BERT_DIR/model.ckpt"}
 epochs=${11:-"2.0"}
 
 echo "Squad directory set as " $SQUAD_DIR " BERT directory set as " $BERT_DIR

+ 3 - 4
TensorFlow/LanguageModeling/BERT/scripts/run_squad_inference.sh

@@ -15,7 +15,7 @@
 
 echo "Container nvidia build = " $NVIDIA_BUILD_ID
 
-init_checkpoint=${1:-"/results/model.ckpt"}
+init_checkpoint=${1:-"data/download/nvidia_pretrained/bert_tf_squad11_large_384/model.ckpt"}
 batch_size=${2:-"8"}
 precision=${3:-"fp16"}
 use_xla=${4:-"true"}
@@ -25,9 +25,9 @@ bert_model=${7:-"large"}
 squad_version=${8:-"1.1"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
 else
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
 fi
 
 export SQUAD_DIR=data/download/squad/v${squad_version}
@@ -83,7 +83,6 @@ python run_squad.py \
 --predict_batch_size=$batch_size \
 --max_seq_length=$seq_length \
 --doc_stride=$doc_stride \
---predict_batch_size=$batch_size \
 --output_dir=$RESULTS_DIR \
 "$use_fp16" \
 $use_xla_tag --version_2_with_negative=${version_2_with_negative}

+ 183 - 58
TensorFlow/LanguageModeling/BERT/triton/README.md

@@ -1,110 +1,235 @@
-# Deploying the BERT model using Triton Inference Server
+
+# Deploying the BERT TensorFlow model using Triton Inference Server
+
+This folder contains instructions for deployment and an exemplary client application to run inference on
+
+Triton Inference Server as well as detailed performance analysis.
 
 ## Table Of Contents
 
-- [Solution Overview](#solution-overview)
-- [Quick Start Guide](#quick-start-guide)
-- [Performance](#performance)
-  * [Advanced Details](#advanced-details)
+* [Solution Overview](#solution-overview)
+
+* [Setup](#setup)
+
+* [Quick Start Guide](#quick-start-guide)
+
+* [Advanced](#advanced)
+
+* [Running the Triton Inference Server](#running-the-triton-inference-server)
+
+* [Running the Triton Inference Client](#running-the-triton-inference-client)
+
+* [Performance](#performance)
+
+* [Latency vs Throughput for TensorRT Engine](#latency-vs-throughput-for-tensorrt-engine)
+
+* [Dynamic batching support](#dynamic-batching-support)
 
-## Solution Overview
+## Solution overview
 
-The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
-This subfolder of the BERT TensorFlow repository contains detailed performance analysis as well as scripts to run SQuAD fine-tuning on BERT model using Triton Inference Server.
+The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP/REST or gRPC endpoint, or by a C API endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
 
-A typical Triton Inference Server pipeline can be broken down into the following 8 steps:
-1. Client serializes the inference request into a message and sends it to the server (Client Send)
-2. Message travels over the network from the client to the server (Network)
-3. Message arrives at server, and is deserialized (Server Receive)
-4. Request is placed on the queue (Server Queue)
-5. Request is removed from the queue and computed (Server Compute)
-6. Completed request is serialized in a message and sent back to the client (Server Send)
-7. Completed message travels over network from the server to the client (Network)
-8. Completed message is deserialized by the client and processed as a completed inference request (Client Receive)
+A typical Triton Inference Server pipeline can be broken down into the following steps:
+
+1. The client serializes the inference request into a message and sends it to the server (Client Send).
+
+2. The message travels over the network from the client to the server (Network).
+
+3. The message arrives at the server, and is deserialized (Server Receive).
+
+4. The request is placed in the queue (Server Queue).
+
+5. The request is removed from the queue and computed (Server Compute).
+
+6. The completed request is serialized in a message and sent back to the client (Server Send).
+
+7. The completed message then travels over the network from the server to the client (Network).
+
+8. The completed message is deserialized by the client and processed as a completed inference request (Client Receive).
 
 Generally, for local clients, steps 1-4 and 6-8 will only occupy a small fraction of time, compared to steps 5-6. As backend deep learning systems like BERT are rarely exposed directly to end users, but instead only interfacing with local front-end servers, for the sake of BERT, we can consider that all clients are local.
-In this section, we will go over how to launch Triton Inference Server and client and get the best performant solution that fits your specific application needs.
 
-Note: The following instructions are run from outside the container and call `docker run` commands as required.
+In this section, we will go over how to launch both the Triton Inference Server and the client and get the best performance solution that fits your specific application needs.
+
+More information on how to perform inference using NVIDIA Triton Inference Server can be found in [triton/README.md](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/triton/README.md).
+
+## Setup
+
+The repository contains a folder `./triton/` with a `Dockerfile` which extends the latest TensorFlow NGC container and encapsulates some dependencies. Ensure you have the following components:
+
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+
+* [TensorFlow NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
+
+* [Triton Inference Server NGC container 20.09](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver)
+
+* [NVIDIA CUDA repository](https://docs.nvidia.com/cuda/archive/10.2/index.html) for NVIDIA TensorRT 7.1.3
+
+* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
 
 ## Quick Start Guide
 
-The `run_triton.sh` script exports the TensorFlow BERT model as a `tensorflow_savedmodel` that Triton Inference Server accepts, builds a matching [Triton Inference Server model config](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-guide/docs/model_configuration.html#), starts the server on local host in a detached state, runs client on SQuAD v1.1 dataset and then evaluates the validity of predictions on the basis of exact match and F1 score all in one step.
+Running the following scripts will build and launch the container containing all required dependencies for native TensorFlow as well as Triton. This is necessary for running inference and can also be used for data download, processing, and training of the model. For more information on the scripts and arguments, refer to the [Advanced](#advanced) section.
+
+1. Clone the repository.
 
 ```bash
-bash triton/scripts/run_triton.sh <init_checkpoint> <batch_size> <precision> <use_xla> <seq_length> <doc_stride> <bert_model> <squad_version> <triton_version_name> <triton_model_name> <triton_export_model> <triton_dyn_batching_delay> <triton_engine_count> <triton_model_overwrite>
+git clone https://github.com/NVIDIA/DeepLearningExamples
+
+cd DeepLearningExamples/TensorFlow/LanguageModeling/BERT
+
 ```
 
-You can also run inference with a sample by passing `--question` and `--context` arguments to the client.
+2. Build a container that extends NGC TensorFlow, Triton Inference Server, and Triton Inference Client.
 
-## Performance
+```bash
+bash scripts/docker/build.sh
 
-Based on the figures 1 and 2 below, we recommend using the Dynamic Batcher with `max_batch_size = 8`, `max_queue_delay_microseconds` as large as possible to fit within your latency window (the values used below are extremely large to exaggerate their effect), and only 1 instance of the engine. The largest improvements to both throughput and latency come from increasing the batch size due to efficiency gains in the GPU with larger batches. The Dynamic Batcher combines the best of both worlds by efficiently batching together a large number of simultaneous requests, while also keeping latency down for infrequent requests. We recommend only 1 instance of the engine due to the negligible improvement to throughput at the cost of significant increases in latency. Many models can benefit from multiple engine instances but as the figures below show, that is not the case for this model.
+```
+
+3. Download fine-tuned checkpoints and SQuAD dataset.
+
+To download the data to `data/download`, run:  
+  
+
+```bash
+bash scripts/docker/launch.sh triton/scripts/triton_data_download.sh
 
-![](../data/images/triton_base_summary.png?raw=true)
+```
+
+4. Run inference.
 
-Figure 1: Latency vs Throughput for BERT Base, FP16, Sequence Length = 128 using various configurations available in Triton Inference Server
 
-![](../data/images/triton_large_summary.png?raw=true)
+The Triton Inference Server can serve either of the following two BERT models:
 
-Figure 2: Latency vs Throughput for BERT Large, FP16, Sequence Length = 384 using various configurations available in Triton Inference Server
+4.1. TensorFlow SavedModel
 
-### Advanced Details
+The `run_triton_tf.sh` script starts the server on a local host in a detached state, runs the client on the SQuAD v1.1 dataset and then evaluates the validity of predictions on the basis of the exact match and F1 score all in one step.
 
-This section digs deeper into the performance numbers and configurations corresponding to running Triton Inference Server for BERT fine tuning for Question Answering. It explains the tradeoffs in selecting maximum batch sizes, batching techniques and number of inference engines on the same GPU to understand how we arrived at the optimal configuration specified previously.
 
-Results can be reproduced by running `generate_figures.sh`. It exports the TensorFlow BERT model as a `tensorflow_savedmodel` that Triton Inference Server accepts, builds a matching [Triton Inference Server model config](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-guide/docs/model_configuration.html#), starts the server on localhost in a detached state and runs [perf_client](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-guide/docs/client.html#performance-example-application) for various configurations.
+The script exports the TensorFlow BERT model checkpoint as a `tensorflow_savedmodel` that Triton Inference Server accepts and builds a matching [Triton Inference Server model config](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-guide/docs/model_configuration.html) when `triton_export_model` is set to `true`.
 
 ```bash
-bash triton/scripts/generate_figures.sh <bert_model> <seq_length> <precision> <init_checkpoint>
+bash triton/scripts/run_triton_tf.sh <init_checkpoint> <batch_size> <precision> <use_xla> <seq_length> <doc_stride> <bert_model> <squad_version> <triton_version_name> <triton_model_name> <triton_export_model> <triton_dyn_batching_delay> <triton_engine_count> <triton_model_overwrite>
+
 ```
 
-All results below are obtained on a single DGX-1 V100 32GB GPU for BERT Base, Sequence Length = 128 and FP16 precision running on a local server. Latencies are indicated by bar plots using the left axis. Throughput is indicated by the blue line plot using the right axis. X-axis indicates the concurrency - the maximum number of inference requests that can be in the pipeline at any given time. For example, when the concurrency is set to 1, the client waits for an inference request to be completed (Step 8) before it sends another to the server (Step 1).  A high number of concurrent requests can reduce the impact of network latency on overall throughput.
+Refer to the advanced section for details on launching client and server separately for debugging.
 
-#### Maximum batch size
+4.2. TensorRT Model
 
-As we can see in Figure 3, the throughput at BS=1, Client Concurrent Requests = 64 is 119 and in Figure 4, the throughput at BS=8, Client Concurrent Requests = 8 is 517, respectively giving a speedup of ~4.3x
+In order to use the BERT TensorRT engine, follow the steps underlined in [TensorRT Repository](https://github.com/NVIDIA/TensorRT/tree/master/demo/BERT) to build a TensorRT engine. Place it as `results/triton_models/<triton_model_name>/<triton_version_name>/model.plan` and use the `run_triton_trt.sh` script as follows.
 
-Note: We compare BS=1, Client Concurrent Requests = 64 to BS=8, Client Concurrent Requests = 8 to keep the Total Number of Outstanding Requests equal between the two different modes. Where Total Number of Outstanding Requests = Batch Size * Client Concurrent Requests. This is also why there are 8 times as many bars on the BS=1 chart than the BS=8 chart.
+```bash
+bash triton/scripts/run_triton_trt.sh <batch_size> <seq_length> <doc_stride> <bert_model> <squad_version> <triton_version_name> <triton_model_name>
 
-Increasing the batch size from 1 to 8 results in an increase in compute time by 1.8x (8.38ms to 15.46ms) showing that computation is more efficient at higher batch sizes. Hence, an optimal batch size would be the maximum batch size that can both fit in memory and is within the preferred latency threshold.
+```
 
-![](../data/images/triton_bs_1.png?raw=true)
+Notes:
 
-Figure 3: Latency & Throughput vs Concurrency at Batch size = 1
+-   [Triton Inference Server 20.09](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel_20-09.html#rel_20-09) is compatible with [TensorRT 7.1.3](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html).
 
-![](../data/images/triton_bs_8.png?raw=true)
 
-Figure 4: Latency & Throughput vs Concurrency at Batch size = 8
+-   The current Triton Inference Server works with the TensorRT engine with `batch_size > 1`.
 
-#### Batching techniques
 
-Static batching is a feature of the inference server that allows inference requests to be served as they are received. It is preferred in scenarios where low latency is desired at the cost of throughput when the GPU is under utilized.
-   To use the performance client with dynamic batching, build an engine with `-b <N> -b <N+1>` to support dynamic batches up to size N.
 
-Dynamic batching is a feature of the inference server that allows inference requests to be combined by the server, so that a batch is created dynamically, resulting in an increased throughput. It is preferred in scenarios where we would like to maximize throughput and GPU utilization at the cost of higher latencies. You can set the [Dynamic Batcher parameters](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-master-branch-guide/docs/model_configuration.html#dynamic-batcher) `max_queue_delay_microseconds` to indicate the maximum amount of time you are willing to wait and ‘preferred_batchsize’ to indicate your optimal batch sizes in the Triton Inference Server model config.
 
-Figures 5 and 6 emphasize the increase in overall throughput with dynamic batching. At low numbers of concurrent requests, the increased throughput comes at the cost of increasing latency as the requests are queued up to `max_queue_delay_microseconds`. The effect of `preferred_batchsize` for dynamic batching is visually depicted by the dip in Server Queue time at integer multiples of the preferred batch sizes. At higher numbers of concurrent requests, observe that the throughput approach a maximum limit as we saturate the GPU utilization.
+## Advanced
+
+The following sections provide greater details about the Triton Inference Server pipeline and inference analysis and benchmarking results.
+
+### Running the Triton Inference Server
+
+Launch the Triton Inference Server in detached mode to run in background by default. To run in the foreground interactively, for debugging purposes, run:
+
+```bash
+DETACHED="-it" bash scripts/docker/launch_server.sh
+
+```
+
+The script mounts and loads models at `$PWD/results/triton_models` to the server with all visible GPUs. In order to selectively choose the devices, set `NVIDIA_VISIBLE_DEVICES`.
+
+### Running the Triton Inference Client
+
+*Real data*
+
+In order to run the client with real data, run:
+
+```bash
+bash triton/scripts/run_client.sh <batch_size> <seq_length> <doc_stride> <triton_version_name> <triton_model_name> <BERT_DIR> <ADDITIONAL_ARGS>
+
+```
+
+The script calls `triton/run_squad_triton_client.py` which preprocesses data and sends/receives requests to/from the server.
+
+`ADDITIONAL_ARGS` must include either `--predict_file` to use the SQuAD dataset or a sample by passing `--question` and `--context`. Append with `--trt_engine` if running inference on a TensorRT engine server.
+
+*Synthetic data*
+
+In order to run the client with synthetic data for performance measurements, run:
+
+```bash
+bash triton/scripts/run_perf_client.sh <model_name> <model_version> <batch_size> <max_latency> <max_client_threads> <max_concurrency> <server_hostname>
+
+```
+
+The script waits until the server is up and running, sends requests as per the constraints set and writes results to `OUTPUT_FILE_CSV="/results/perf_client/${MODEL_NAME}/results_${TIMESTAMP}.csv`.
+
+For more information about `perf_client`, refer to the [official documentation](https://docs.nvidia.com/deeplearning/triton-inference-server/master-user-guide/docs/optimization.html#perf-client).
+
+## Performance
+
+### Latency vs Throughput for TensorRT Engine
+
+Performance numbers for BERT Large, sequence length=384 are obtained from [experiments](https://github.com/NVIDIA/TensorRT/tree/release/7.1/demo/BERT#inference-performance-nvidia-a100-40gb) on NVIDIA A100 with 1x A100 40G GPUs. Throughput is measured in samples/second, and latency in milliseconds.
+
+![](../data/images/bert_trt_throughput_vs_latency.png?raw=true)
+
+The plot above shows that throughput gains taper off from increasing batch size above 12. There is minimal gain in throughput going from batch size 12 to 128. However, running inference with a single large batch might be faster than running several small inference requests. Therefore, we choose to maximize batch size for Dynamic Batching with a maximum acceptable queuing delay of 50ms and maximum acceptable inference latency of 100ms.
+
+### Dynamic Batching Support
+
+The Triton server has a built-in dynamic batching mechanism that can be enabled. When it is enabled, the server creates
+
+inference batches from the received requests. With dynamic batching enabled, the server will concatenate requests that come in within the maximum queue delay time into a single inference batch. To configure these parameters and run dynamic batching for a model, issue:  
+  
+
+```bash
+#Set server config for dynamic batching with maximum queue delay  
+echo "dynamic_batching { max_queue_delay_microseconds: 1000 }" >> results/triton_models/bert/config.pbtxt  
+
+#Launch Server
+DETACHED="-it" bash triton/scripts/launch_server.sh
+
+
+#Run perf client in another terminal
+bash triton/scripts/run_perf_client.sh bert 1 12
+
+```
 
-![](../data/images/triton_static.png?raw=true)
Note that the TensorRT engine takes 30+ minutes to build depending on the profile size. Loading it on the TRITON server can be sped up by loading it only on the required GPUs.
 
-Figure 5: Latency & Throughput vs Concurrency using Static Batching at `Batch size` = 1
+Performance results on a single A100 40G for various numbers of simultaneous requests are shown in the figure below.
 
-![](../data/images/triton_dynamic.png?raw=true)
+![](../data/images/bert_triton_dynamic_batching_a100.png?raw=true)
 
-Figure 6: Latency & Throughput vs Concurrency using Dynamic Batching at `Batch size` = 1, `preferred_batchsize` = [4, 8] and `max_queue_delay_microseconds` = 5000
+The plot above shows that if we have a 100ms upper bound on latency, then a single GPU can handle up to 9 concurrent requests before throughput saturates. This leads to total throughput of ~1045 sequences per second.
 
-#### Model execution instance count
+## Release Notes
 
-Triton Inference Server enables us to launch multiple engines in separate CUDA streams by setting the `instance_group_count` parameter to improve both latency and throughput. Multiple engines are useful when the model doesn’t saturate the GPU allowing the GPU to run multiple instances of the model in parallel.
+### Changelog
 
-Figures 7 and 8 show a drop in queue time as more models are available to serve an inference request. However, this is countered by an increase in compute time as multiple models compete for resources. Since BERT is a large model which utilizes the majority of the GPU, the benefit to running multiple engines is not seen.
 
-![](../data/images/triton_ec_1.png?raw=true)
+October 2020
+Add scripts to use TensorRT engines for inference
 
-Figure 7: Latency & Throughput vs Concurrency at Batch size = 1, Engine Count = 1
-(One copy of the model loaded in GPU memory)
+September 2020
+Update to TRITON 20.08
 
-![](../data/images/triton_ec_4.png?raw=true)
+April 2020
+TRTIS -> TRITON
 
-Figure 8: Latency & Throughput vs Concurrency at Batch size = 1, Engine count = 4
-(Four copies the model loaded in GPU memory)
+October 2019
+Initial release

+ 46 - 25
TensorFlow/LanguageModeling/BERT/triton/run_squad_triton_client.py

@@ -32,10 +32,6 @@ flags = tf.flags
 FLAGS = flags.FLAGS
 
 ## Required parameters
-flags.DEFINE_string(
-    "bert_config_file", None,
-    "The config json file corresponding to the pre-trained BERT model. "
-    "This specifies the model architecture.")
 
 flags.DEFINE_string("vocab_file", None,
                     "The vocabulary file that the BERT model was trained on.")
@@ -86,6 +82,9 @@ flags.DEFINE_bool(
     "verbose_logging", False,
     "If true, all of the warnings related to data processing will be printed. "
     "A number of warnings are expected for a normal SQuAD evaluation.")
+flags.DEFINE_bool(
+    "trt_engine", False,
+    "If true, expects a trt engine defined input/output")
 
 # Triton Specific flags
 flags.DEFINE_string("triton_model_name", "bert", "exports to appropriate directory for Triton")
@@ -127,6 +126,13 @@ def batch(iterable, n=1):
             input_ids_data = input_ids_data+ (np.array(iterable[ndx + i].input_ids, dtype=np.int32),)
             input_mask_data = input_mask_data+ (np.array(iterable[ndx + i].input_mask, dtype=np.int32),)
             segment_ids_data = segment_ids_data+ (np.array(iterable[ndx + i].segment_ids, dtype=np.int32),)
+        if FLAGS.trt_engine and len(label_ids_data) != n: #TRT needs exact batch size. Pad as necessary
+            pad_size = n - len(label_ids_data)
+            label_ids_data = label_ids_data + ((np.array([0], dtype=np.int32),) * pad_size)
+            input_ids_data = input_ids_data + ((np.zeros(FLAGS.max_seq_length, dtype=np.int32),) * pad_size)
+            input_mask_data = input_mask_data + ((np.zeros(FLAGS.max_seq_length, dtype=np.int32),) * pad_size)
+            segment_ids_data = segment_ids_data + ((np.zeros(FLAGS.max_seq_length, dtype=np.int32),) * pad_size)
+
 
         inputs_dict = {label_id_key: label_ids_data,
                        'input_ids': input_ids_data,
@@ -144,19 +150,23 @@ def main(_):
     """
     os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM
 
+    tf.compat.v1.logging.info("***** Configuaration *****")
+    for key in FLAGS.__flags.keys():
+      tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
+    tf.compat.v1.logging.info("**************************")
+
     tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
 
     # Get the Data
-    if FLAGS.predict_file:
-        eval_examples = read_squad_examples(
-            input_file=FLAGS.predict_file, is_training=False,
-            version_2_with_negative=FLAGS.version_2_with_negative)
-    elif FLAGS.question and FLAGS.answer:
+    if FLAGS.question and FLAGS.context:
         input_data = [{"paragraphs":[{"context":FLAGS.context,
                         "qas":[{"id":0, "question":FLAGS.question}]}]}]
-
         eval_examples = read_squad_examples(input_file=None, is_training=False,
             version_2_with_negative=FLAGS.version_2_with_negative, input_data=input_data)
+    elif FLAGS.predict_file:
+        eval_examples = read_squad_examples(
+            input_file=FLAGS.predict_file, is_training=False,
+            version_2_with_negative=FLAGS.version_2_with_negative)
     else:
         raise ValueError("Either predict_file or question+answer need to defined")
 
@@ -166,7 +176,7 @@ def main(_):
         eval_features.append(feature)
 
     convert_examples_to_features(
-        examples=eval_examples[0:],
+        examples=eval_examples,
         tokenizer=tokenizer,
         max_seq_length=FLAGS.max_seq_length,
         doc_stride=FLAGS.doc_stride,
@@ -217,10 +227,17 @@ def main(_):
         time_list.append(stop - start_time)
 
         batch_count = len(inputs[label_id_key])
-
-        start_logits_results = result.as_numpy("start_logits")
-        end_logits_results = result.as_numpy("end_logits")
-
+        if FLAGS.trt_engine:
+            cls_squad_logits = result.as_numpy("cls_squad_logits")
+            try: #when batch size > 1
+                start_logits_results = np.array(cls_squad_logits.squeeze()[:, :, 0])
+                end_logits_results = np.array(cls_squad_logits.squeeze()[:, :, 1])
+            except:
+                start_logits_results = np.expand_dims(np.array(cls_squad_logits.squeeze()[:, 0]), axis=0)
+                end_logits_results = np.expand_dims(np.array(cls_squad_logits.squeeze()[:, 1]), axis=0)
+        else:
+            start_logits_results = result.as_numpy("start_logits")
+            end_logits_results = result.as_numpy("end_logits")
         for i in range(batch_count):
             unique_id = int(inputs[label_id_key][i][0])
             start_logits = [float(x) for x in start_logits_results[i].flat]
@@ -232,7 +249,7 @@ def main(_):
                     end_logits=end_logits))
 
         recv_prog.update(n=batch_count)
-       	return outstanding
+        return outstanding
 
     all_results = []
     time_list = []
@@ -245,24 +262,29 @@ def main(_):
 
         present_batch_size = len(inputs_dict[label_id_key])
 
-        label_ids_data = np.stack(inputs_dict[label_id_key])
+        if not FLAGS.trt_engine:
+            label_ids_data = np.stack(inputs_dict[label_id_key])
         input_ids_data = np.stack(inputs_dict['input_ids'])
         input_mask_data = np.stack(inputs_dict['input_mask'])
         segment_ids_data = np.stack(inputs_dict['segment_ids'])
 
         inputs = []
-        inputs.append(tritongrpcclient.InferInput(label_id_key, label_ids_data.shape, "INT32"))
-        inputs[0].set_data_from_numpy(label_ids_data)
         inputs.append(tritongrpcclient.InferInput('input_ids', input_ids_data.shape, "INT32"))
-        inputs[1].set_data_from_numpy(input_ids_data)
+        inputs[0].set_data_from_numpy(input_ids_data)
         inputs.append(tritongrpcclient.InferInput('input_mask', input_mask_data.shape, "INT32"))
-        inputs[2].set_data_from_numpy(input_mask_data)
+        inputs[1].set_data_from_numpy(input_mask_data)
         inputs.append(tritongrpcclient.InferInput('segment_ids', segment_ids_data.shape, "INT32"))
-        inputs[3].set_data_from_numpy(segment_ids_data)
+        inputs[2].set_data_from_numpy(segment_ids_data)
+        if not FLAGS.trt_engine:
+            inputs.append(tritongrpcclient.InferInput(label_id_key, label_ids_data.shape, "INT32"))
+            inputs[3].set_data_from_numpy(label_ids_data)
 
         outputs = []
-        outputs.append(tritongrpcclient.InferRequestedOutput('start_logits'))
-        outputs.append(tritongrpcclient.InferRequestedOutput('end_logits'))
+        if FLAGS.trt_engine:
+            outputs.append(tritongrpcclient.InferRequestedOutput('cls_squad_logits'))
+        else:
+            outputs.append(tritongrpcclient.InferRequestedOutput('start_logits'))
+            outputs.append(tritongrpcclient.InferRequestedOutput('end_logits'))
 
         start_time = time.time()
         triton_client.async_infer(
@@ -339,6 +361,5 @@ def main(_):
 
 if __name__ == "__main__":
   flags.mark_flag_as_required("vocab_file")
-  flags.mark_flag_as_required("bert_config_file")
   tf.compat.v1.app.run()
 

+ 2 - 3
TensorFlow/LanguageModeling/BERT/triton/scripts/export_model.sh

@@ -12,14 +12,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-init_checkpoint=${1:-"/results/models/bert_large_fp16_384_v1/model.ckpt-5474"}
+init_checkpoint=${1:-"data/download/nvidia_pretrained/bert_tf_squad11_large_384/model.ckpt"}
 batch_size=${2:-"8"}
 precision=${3:-"fp16"}
 use_xla=${4:-"true"}
 seq_length=${5:-"384"}
 doc_stride=${6:-"128"}
-BERT_DIR=${7:-"data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16"}
+BERT_DIR=${7:-"data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb"}
 triton_model_version=${8:-1}
 triton_model_name=${9:-"bert"}
 triton_dyn_batching_delay=${10:-0}

+ 0 - 115
TensorFlow/LanguageModeling/BERT/triton/scripts/generate_figures.sh

@@ -1,115 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Set the number of devices to use
-export NVIDIA_VISIBLE_DEVICES=0
-
-# Always need to be overwriting models to keep memory use low
-export TRITON_MODEL_OVERWRITE=True
-
-bert_model=${1:-small}
-seq_length=${2:-128}
-precision=${3:-fp16}
-init_checkpoint=${4:-"/results/models/bert_${bert_model}_${precision}_${seq_length}_v1/model.ckpt-5474"}
-
-MODEL_NAME="bert_${bert_model}_${seq_length}_${precision}"
-
-if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
-else
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
-fi
-
-doc_stride=128
-use_xla=true
-EXPORT_MODEL_ARGS="${precision} ${use_xla} ${seq_length} ${doc_stride} ${BERT_DIR} 1 ${MODEL_NAME}"
-PERF_CLIENT_ARGS="1000 10 20 localhost"
-
-############## Dynamic Batching Comparison ##############
-SERVER_BATCH_SIZE=8
-CLIENT_BATCH_SIZE=1
-TRITON_ENGINE_COUNT=1
-
-# Dynamic batching 10 ms
-TRITON_DYN_BATCHING_DELAY=10
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
-
-# Dynamic batching 5 ms
-TRITON_DYN_BATCHING_DELAY=5
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
-
-# Dynamic batching 2 ms
-TRITON_DYN_BATCHING_DELAY=2
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
-
-
-# Static Batching (i.e. Dynamic batching 0 ms)
-TRITON_DYN_BATCHING_DELAY=0
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
-
-
-# ############## Engine Count Comparison ##############
-SERVER_BATCH_SIZE=1
-CLIENT_BATCH_SIZE=1
-TRITON_DYN_BATCHING_DELAY=0
-
-# Engine Count = 4
-TRITON_ENGINE_COUNT=4
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
-
-# Engine Count = 2
-TRITON_ENGINE_COUNT=2
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
-
-# Engine Count = 1
-TRITON_ENGINE_COUNT=1
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
-
-
-############## Batch Size Comparison ##############
-# BATCH=1 Generate model and perf
-SERVER_BATCH_SIZE=1
-CLIENT_BATCH_SIZE=1
-TRITON_ENGINE_COUNT=1
-TRITON_DYN_BATCHING_DELAY=0
-
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 64 localhost
-
-# BATCH=2 Generate model and perf
-SERVER_BATCH_SIZE=2
-CLIENT_BATCH_SIZE=2
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 32 localhost
-
-# BATCH=4 Generate model and perf
-SERVER_BATCH_SIZE=4
-CLIENT_BATCH_SIZE=4
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 16 localhost
-
-# BATCH=8 Generate model and perf
-SERVER_BATCH_SIZE=8
-CLIENT_BATCH_SIZE=8
-bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 8 localhost
-

+ 4 - 3
TensorFlow/LanguageModeling/BERT/triton/scripts/launch_server.sh

@@ -1,7 +1,8 @@
 NV_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"all"}
+DETACHED=${DETACHED:-"-d"}
 
-# Start TRITON server in detached state
-docker run --gpus $NV_VISIBLE_DEVICES --rm -d \
+# Start TRITON server in DETACHED state
+docker run --gpus $NV_VISIBLE_DEVICES --rm $DETACHED \
    --shm-size=1g \
    --ulimit memlock=-1 \
    --ulimit stack=67108864 \
@@ -11,4 +12,4 @@ docker run --gpus $NV_VISIBLE_DEVICES --rm -d \
    --name triton_server_cont \
    -e NVIDIA_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES \
    -v $PWD/results/triton_models:/models \
-   nvcr.io/nvidia/tritonserver:20.06-py3 tritonserver --model-store=/models --strict-model-config=false
+   nvcr.io/nvidia/tritonserver:20.09-py3 tritonserver --model-store=/models --strict-model-config=false --log-verbose=1

+ 1 - 2
TensorFlow/LanguageModeling/BERT/triton/scripts/run_client.sh

@@ -18,14 +18,13 @@ seq_length=${2:-"384"}
 doc_stride=${3:-"128"}
 triton_version_name=${4:-"1"}
 triton_model_name=${5:-"bert"}
-BERT_DIR=${6:-"data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16"}
+BERT_DIR=${6:-"data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb"}
 
 bash scripts/docker/launch.sh \
    "python triton/run_squad_triton_client.py \
       --triton_model_name=$triton_model_name \
       --triton_model_version=$triton_version_name \
       --vocab_file=$BERT_DIR/vocab.txt \
-      --bert_config_file=$BERT_DIR/bert_config.json \
       --predict_batch_size=$batch_size \
       --max_seq_length=${seq_length} \
       --doc_stride=${doc_stride} \

+ 7 - 8
TensorFlow/LanguageModeling/BERT/triton/scripts/run_perf_client.sh

@@ -15,12 +15,11 @@
 
 MODEL_NAME=${1:-"bert"}
 MODEL_VERSION=${2:-1}
-precision=${3:-"fp16"}
-BATCH_SIZE=${4:-1}
-MAX_LATENCY=${5:-500}
-MAX_CLIENT_THREADS=${6:-10}
-MAX_CONCURRENCY=${7:-50}
-SERVER_HOSTNAME=${8:-"localhost"}
+BATCH_SIZE=${3:-1}
+MAX_LATENCY=${4:-100}
+MAX_CLIENT_THREADS=${5:-10}
+MAX_CONCURRENCY=${6:-50}
+SERVER_HOSTNAME=${7:-"localhost"}
 
 if [[ $SERVER_HOSTNAME == *":"* ]]; then
   echo "ERROR! Do not include the port when passing the Server Hostname. These scripts require that the TRITON HTTP endpoint is on Port 8000 and the gRPC endpoint is on Port 8001. Exiting..."
@@ -58,9 +57,9 @@ ARGS="\
    --max-threads ${MAX_CLIENT_THREADS} \
    -m ${MODEL_NAME} \
    -x ${MODEL_VERSION} \
-   -p 3000 \
+   -p 200000 \
    -d \
-   -v \
+   -v -z \
    -i gRPC \
    -u ${SERVER_HOSTNAME}:8001 \
    -b ${BATCH_SIZE} \

+ 7 - 15
TensorFlow/LanguageModeling/BERT/triton/scripts/run_triton.sh → TensorFlow/LanguageModeling/BERT/triton/scripts/run_triton_tf.sh

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-init_checkpoint=${1:-"/results/models/bert_large_fp16_384_v1/model.ckpt-5474"}
+init_checkpoint=${1:-"data/download/nvidia_pretrained/bert_tf_squad11_large_384/model.ckpt"}
 batch_size=${2:-"8"}
 precision=${3:-"fp16"}
 use_xla=${4:-"true"}
@@ -27,18 +27,13 @@ triton_export_model=${11:-"true"}
 triton_dyn_batching_delay=${12:-0}
 triton_engine_count=${13:-1}
 triton_model_overwrite=${14:-"False"}
-squad_version=${15:-"1.1"}
 
 if [ "$bert_model" = "large" ] ; then
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
 else
-    export BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
 fi
 
-if [ ! -d "$BERT_DIR" ] ; then
-   echo "Error! $BERT_DIR directory missing. Please mount pretrained BERT dataset."
-   exit -1
-fi
 
 export SQUAD_DIR=data/download/squad/v${squad_version}
 if [ "$squad_version" = "1.1" ] ; then
@@ -47,12 +42,6 @@ else
     version_2_with_negative="True"
 fi
 
-echo "Squad directory set as " $SQUAD_DIR
-if [ ! -d "$SQUAD_DIR" ] ; then
-   echo "Error! $SQUAD_DIR directory missing. Please mount SQuAD dataset."
-   exit -1
-fi
-
 # Need to ignore case on some variables
 triton_export_model=$(echo "$triton_export_model" | tr '[:upper:]' '[:lower:]')
 
@@ -63,6 +52,7 @@ echo " BERT directory set as " $BERT_DIR
 echo
 echo "Argument: "
 echo "   init_checkpoint = $init_checkpoint"
+echo "   Using TRT engine= $use_trt_model"
 echo "   batch_size      = $batch_size"
 echo "   precision       = $precision"
 echo "   use_xla         = $use_xla"
@@ -78,8 +68,10 @@ echo "Env: "
 echo "   NVIDIA_VISIBLE_DEVICES = $NV_VISIBLE_DEVICES"
 echo
 
+
 # Export Model in SavedModel format if enabled
 if [ "$triton_export_model" = "true" ] ; then
+
    echo "Exporting model as: Name - $triton_model_name Version - $triton_version_name"
 
       bash triton/scripts/export_model.sh $init_checkpoint $batch_size $precision $use_xla $seq_length \
@@ -95,7 +87,7 @@ bash triton/scripts/wait_for_triton_server.sh localhost
 
 # Start TRTIS client for inference on SQuAD Dataset
 bash triton/scripts/run_client.sh $batch_size $seq_length $doc_stride $triton_version_name $triton_model_name \
-    $BERT_DIR --predict_file=$SQUAD_DIR/dev-v${squad_version}.json --version_2_with_negative=${version_2_with_negative}
+    $BERT_DIR --version_2_with_negative=${version_2_with_negative} --predict_file=$SQUAD_DIR/dev-v${squad_version}.json 
 
 # Evaluate SQuAD results
 bash scripts/docker/launch.sh "python $SQUAD_DIR/evaluate-v${squad_version}.py \

+ 82 - 0
TensorFlow/LanguageModeling/BERT/triton/scripts/run_triton_trt.sh

@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+batch_size=${1:-"8"}
+seq_length=${2:-"384"}
+doc_stride=${3:-"128"}
+bert_model=${4:-"large"}
+squad_version=${5:-"1.1"}
+triton_version_name=${6:-1}
+triton_model_name=${7:-"bert"}
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
+else
+    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
+fi
+
+if [ ! -d "$BERT_DIR" ] ; then
+   echo "Error! $BERT_DIR directory missing. Please mount pretrained BERT dataset."
+   exit -1
+fi
+
+export SQUAD_DIR=data/download/squad/v${squad_version}
+if [ "$squad_version" = "1.1" ] ; then
+    version_2_with_negative="False"
+else
+    version_2_with_negative="True"
+fi
+
+echo "Squad directory set as " $SQUAD_DIR
+if [ ! -d "$SQUAD_DIR" ] ; then
+   echo "Error! $SQUAD_DIR directory missing. Please mount SQuAD dataset."
+   exit -1
+fi
+
+# Explicitly save this variable to pass down to new containers
+NV_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"all"}
+
+echo " BERT directory set as " $BERT_DIR
+echo
+echo "Argument: "
+echo "   init_checkpoint = $init_checkpoint"
+echo "   batch_size      = $batch_size"
+echo "   seq_length      = $seq_length"
+echo "   doc_stride      = $doc_stride"
+echo "   squad_version   = $squad_version"
+echo "   version_name    = $triton_version_name"
+echo "   model_name      = $triton_model_name"
+echo
+echo "Env: "
+echo "   NVIDIA_VISIBLE_DEVICES = $NV_VISIBLE_DEVICES"
+echo
+
+
+# Start TRTIS server in detached state
+bash triton/scripts/launch_server.sh
+
+# Wait until server is up. curl on the health of the server and sleep until its ready
+bash triton/scripts/wait_for_triton_server.sh localhost
+
+# Start TRTIS client for inference on SQuAD Dataset
+bash triton/scripts/run_client.sh $batch_size $seq_length $doc_stride $triton_version_name $triton_model_name \
+    $BERT_DIR --version_2_with_negative=${version_2_with_negative} --trt_engine --predict_file=$SQUAD_DIR/dev-v${squad_version}.json 
+
+# Evaluate SQuAD results
+bash scripts/docker/launch.sh "python $SQUAD_DIR/evaluate-v${squad_version}.py \
+    $SQUAD_DIR/dev-v${squad_version}.json /results/predictions.json"
+
+#Kill the TRTIS Server
+docker kill triton_server_cont

+ 47 - 0
TensorFlow/LanguageModeling/BERT/triton/scripts/triton_data_download.sh

@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export BERT_PREP_WORKING_DIR="${BERT_PREP_WORKING_DIR}"
+
+#SQuAD
+if [ ! -d /workspace/bert/data/download/squad ]; then
+	echo "Downloading SQuAD"
+	python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
+	mkdir -p /workspace/bert/data/download/nvidia_pretrained
+fi
+
+#SQuAD Large Checkpoint
+if [ ! -d /workspace/bert/data/download/nvidia_pretrained/bert_tf_squad11_large_384 ]; then
+	echo "Downloading SQuAD Large Checkpoint"
+	cd /workspace/bert/data/download/nvidia_pretrained && \
+		wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_ckpt_large_qa_squad11_amp_384/versions/19.03.1/zip -O bert_tf_ckpt_large_qa_squad11_amp_384_19.03.1.zip \
+		 && unzip bert_tf_ckpt_large_qa_squad11_amp_384_19.03.1.zip -d bert_tf_squad11_large_384 && rm bert_tf_ckpt_large_qa_squad11_amp_384_19.03.1.zip
+fi
+
+#SQuAD Base Checkpoint
+if [ ! -d /workspace/bert/data/download/nvidia_pretrained/bert_tf_squad11_base_128 ]; then
+	echo "Downloading SQuAD Base Checkpoint"
+	cd /workspace/bert/data/download/nvidia_pretrained && \
+		wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_ckpt_base_qa_squad11_amp_128/versions/19.03.1/zip -O bert_tf_ckpt_base_qa_squad11_amp_128_19.03.1.zip \
+		 && unzip bert_tf_ckpt_base_qa_squad11_amp_128_19.03.1.zip -d bert_tf_squad11_base_128 && rm bert_tf_ckpt_base_qa_squad11_amp_128_19.03.1.zip
+fi
+
+#Pretraining Large checkpoint
+if [ ! -d /workspace/bert/data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb ]; then
+	echo "Downloading Pretraining Large Checkpoint"
+	cd /workspace/bert/data/download/nvidia_pretrained && \
+		wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_ckpt_large_pretraining_amp_lamb/versions/19.03.1/zip -O bert_tf_ckpt_large_pretraining_amp_lamb_19.03.1.zip \
+		&& unzip bert_tf_ckpt_large_pretraining_amp_lamb_19.03.1.zip -d bert_tf_pretraining_large_lamb && rm bert_tf_ckpt_large_pretraining_amp_lamb_19.03.1.zip
+fi

+ 2 - 1
TensorFlow/LanguageModeling/BERT/triton/scripts/wait_for_triton_server.sh

@@ -30,4 +30,5 @@ while [[ ${current_status} != "200" ]] || [[ $($ready_command) != "200" ]]; do
    current_status=$($live_command)
 done
 
-echo "TRITON Server is ready!"
+echo "TRITON Server is ready!"
+

+ 0 - 2
TensorFlow/LanguageModeling/BERT/trt/.dockerignore

@@ -1,2 +0,0 @@
-engines/
-models/

+ 0 - 67
TensorFlow/LanguageModeling/BERT/trt/Dockerfile

@@ -1,67 +0,0 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-FROM nvcr.io/nvidia/tensorrt:20.02-py3
-
-LABEL maintainer="NVIDIA CORPORATION"
-
-ARG uid=1000
-ARG gid=1000
-RUN groupadd -r -f -g ${gid} trtuser && useradd -r -u ${uid} -g ${gid} -ms /bin/bash trtuser
-RUN usermod -aG sudo trtuser
-RUN echo 'trtuser:nvidia' | chpasswd
-RUN chown trtuser /workspace
-
-# Install requried libraries
-RUN apt-get update && apt-get install -y software-properties-common
-RUN add-apt-repository ppa:ubuntu-toolchain-r/test
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libcurl4-openssl-dev \
-    wget \
-    zlib1g-dev \
-    git \
-    pkg-config \
-    python3 \
-    python3-pip \
-    python3-dev \
-    python3-setuptools \
-    python3-wheel \
-    sudo \
-    pbzip2 \
-    pv \
-    bzip2 \
-    unzip
-
-# Install Cmake
-RUN cd /tmp && \
-    wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \
-    chmod +x cmake-3.14.4-Linux-x86_64.sh && \
-    ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \
-    rm ./cmake-3.14.4-Linux-x86_64.sh
-
-# Download NGC client
-RUN cd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_bat_linux.zip && unzip ngccli_bat_linux.zip && chmod u+x ngc && rm ngccli_bat_linux.zip ngc.md5 && echo "no-apikey\nascii\nno-org\nno-team\nno-ace\n" | ngc config set
-
-# Install required Python packages
-COPY requirements.txt /tmp/requirements.txt
-RUN python3 -m pip install -r /tmp/requirements.txt
-
-# Build with extra flags to enable plugins available on SM 70/75
-RUN git clone https://github.com/NVIDIA/TensorRT.git
-RUN cd TensorRT && git checkout 9b2d78a87e3d45eca22b6bfcb6e10777b932ef43 && git submodule update --init --recursive && mkdir build && cd build && cmake .. -DGPU_ARCHS="70 75" -DBUILD_PARSERS=OFF && make -j nvinfer_plugin
-# Replace the default plugin library with the open-source plugins
-RUN cp TensorRT/build/libnvinfer_plugin.so* /usr/lib/x86_64-linux-gnu/
-
-WORKDIR /workspace/bert
-USER trtuser

+ 0 - 353
TensorFlow/LanguageModeling/BERT/trt/README.md

@@ -1,353 +0,0 @@
-# BERT Inference Using TensorRT
-
-This subfolder of the BERT TensorFlow repository, tested and maintained by NVIDIA, provides scripts to perform high-performance inference using NVIDIA TensorRT.
-
-
-## Table Of Contents
-
-- [Model Overview](#model-overview)
-   * [Model Architecture](#model-architecture)
-   * [TensorRT Inference Pipeline](#tensorrt-inference-pipeline)
-   * [Version Info](#version-info)
-- [Setup](#setup)
-   * [Requirements](#requirements)
-- [Quick Start Guide](#quick-start-guide)
-   * [(Optional) Trying a different configuration](#optional-trying-a-different-configuration)
-- [Advanced](#advanced)
-   * [Scripts and sample code](#scripts-and-sample-code)
-   * [Command-line options](#command-line-options)
-   * [TensorRT inference process](#tensorrt-inference-process)
-- [Performance](#performance)
-   * [Benchmarking](#benchmarking)
-      * [TensorRT inference benchmark](#tensorrt-inference-benchmark)
-   * [Results](#results)
-      * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
-      * [BERT Base](#bert-base)
-      * [BERT Large](#bert-large)
-   * [Inference performance: NVIDIA V100 (32GB)](#inference-performance-nvidia-v100-(32gc))
-      * [BERT Base](#bert-base)
-      * [BERT Large](#bert-large)
-
-
-
-## Model overview
-
-BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's BERT is an optimized version of [Google's official implementation](https://github.com/google-research/bert), leveraging mixed precision arithmetic and Tensor Cores for faster inference times while maintaining target accuracy.
-
-Other publicly available implementations of BERT include:
-1. [NVIDIA PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)
-2. [Hugging Face](https://github.com/huggingface/pytorch-pretrained-BERT)
-3. [codertimo](https://github.com/codertimo/BERT-pytorch)
-4. [gluon-nlp](https://github.com/dmlc/gluon-nlp/tree/master/scripts/bert)
-5. [Google's official implementation](https://github.com/google-research/bert)
-
-
-### Model architecture
-
-BERT's model architecture is a multi-layer bidirectional Transformer encoder. Based on the model size, we have the following two default configurations of BERT:
-
-| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feed-forward filter size** | **Max sequence length** | **Parameters** |
-|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|
-|BERT-Base |12 encoder| 768| 12|4 x  768|512|110M|
-|BERT-Large|24 encoder|1024| 16|4 x 1024|512|330M|
-
-Typically, the language model is followed by a few task-specific layers. The model used here includes layers for question answering.
-
-### TensorRT Inference Pipeline
-
-BERT inference consists of three main stages: tokenization, the BERT model, and finally a projection of the tokenized prediction onto the original text.
-Since the tokenizer and projection of the final predictions are not nearly as compute-heavy as the model itself, we run them on the host. The BERT model is GPU-accelerated via TensorRT.
-
-The tokenizer splits the input text into tokens that can be consumed by the model. For details on this process, see [this tutorial](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/).
-
-To run the BERT model in TensorRT, we construct the model using TensorRT APIs and import the weights from a pre-trained TensorFlow checkpoint from [NGC](https://ngc.nvidia.com/models/nvidian:bert_tf_v2_large_fp16_128). Finally, a TensorRT engine is generated and serialized to the disk. The various inference scripts then load this engine for inference.
-
-Lastly, the tokens predicted by the model are projected back to the original text to get a final result.
-
-### Version Info
-
-The following software version configuration has been tested:
-
-|Software|Version|
-|--------|-------|
-|Python|3.6.9|
-|TensorFlow|1.13.1|
-|TensorRT|7.0.0.1|
-|CUDA|10.2.89|
-
-
-## Setup
-
-The following section lists the requirements that you need to meet in order to run the BERT model.
-
-### Requirements
-
-This repository contains a `Dockerfile` which extends the TensorRT NGC container and installs some dependencies. Ensure you have the following components:
-
-* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [TensorRT 20.02-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt)
-* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU with NVIDIA Driver 440.33.01 or later.
-
-Required Python packages are listed in `requirements.txt`. These packages are automatically installed inside the container.
-
-## Quick Start Guide
-
-1. Create and launch the BERT container:
-    ```bash
-    bash trt/scripts/build.sh && bash trt/scripts/launch.sh
-    ```
-
-    **Note:** After this point, all commands should be run from within the container.
-
-2. Download checkpoints for a pre-trained BERT model:
-    ```bash
-    bash scripts/download_model.sh
-    ```
-    This will download checkpoints for a BERT Large FP16 SQuAD v2 model with a sequence length of 128 by default.
-
-**Note:** Since the checkpoints are stored in the directory mounted from the host, they do *not* need to be downloaded each time the container is launched. 
-
-3. Build a TensorRT engine. To build an engine, run the `builder.py` script. For example:
-    ```bash
-    mkdir -p /workspace/bert/engines && python3 builder.py -m /workspace/bert/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/model.ckpt-8144 -o /workspace/bert/engines/bert_large_128.engine -b 1 -s 128 --fp16 -c /workspace/bert/models/fine-tuned/bert_tf_v2_large_fp16_128_v2
-    ```
-
-    This will build an engine with a maximum batch size of 1 (`-b 1`), and sequence length of 128 (`-s 128`) using mixed precision (`--fp16`) using the BERT Large V2 FP16 Sequence Length 128 checkpoint (`-c /workspace/bert/models/fine-tuned/bert_tf_v2_large_fp16_128_v2`).
-
-4. Run inference. Two options are provided for running the model.
-
-    a. `inference.py` script
-    This script accepts a passage and question and then runs the engine to generate an answer. The vocabulary file used to train the source model is also specified (`-v /workspace/bert/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/vocab.txt`).
-    For example:
-    ```bash
-    python3 inference.py -e /workspace/bert/engines/bert_large_128.engine -p "TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps." -q "What is TensorRT?" -v /workspace/bert/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/vocab.txt
-    ```
-
-    b. `inference.ipynb` Jupyter Notebook
-    The Jupyter Notebook includes a passage and various example questions and allows you to interactively make modifications and see the outcome.
-    To launch the Jupyter Notebook from inside the container, run:
-    ```bash
-    jupyter notebook --ip 0.0.0.0 inference.ipynb
-    ```
-    Then, use your browser to open the link displayed. The link should look similar to: `http://127.0.0.1:8888/?token=<TOKEN>`
-
-
-### (Optional) Trying a different configuration
-
-If you would like to run another configuration, you can manually download checkpoints using the included script. For example, run:
-```bash
-bash scripts/download_model.sh base
-```
-to download a BERT Base model instead of the default BERT Large model.
-
-To view all available model options, run:
-```bash
-bash scripts/download_model.sh -h
-```
-
-## Advanced
-
-The following sections provide greater details on inference with TensorRT.
-
-### Scripts and sample code
-
-In the `root` directory, the most important files are:
-
-- `builder.py` - Builds an engine for the specified BERT model
-- `Dockerfile` - Container which includes dependencies and model checkpoints to run BERT
-- `inference.ipynb` - Runs inference interactively
-- `inference.py` - Runs inference with a given passage and question
-- `perf.py` - Runs inference benchmarks
-
-The `scripts/` folder encapsulates all the one-click scripts required for running various supported functionalities, such as:
-
-- `build.sh` - Builds a Docker container that is ready to run BERT
-- `launch.sh` - Launches the container created by the `build.sh` script.
-- `download_model.sh` - Downloads pre-trained model checkpoints from NGC
-- `inference_benchmark.sh` - Runs an inference benchmark and prints results
-
-Other folders included in the `root` directory are:
-
-- `helpers` - Contains helpers for tokenization of inputs
-
-### Command-line options
-
-To view the available parameters for each script, you can use the help flag (`-h`).
-
-### TensorRT inference process
-
-As mentioned in the [Quick Start Guide](#quick-start-guide), two options are provided for running inference:
-1. The `inference.py` script which accepts a passage and a question and then runs the engine to generate an answer. Alternatively, this script can be used to run inference on the Squad dataset.
-2. The `inference.ipynb` Jupyter Notebook which includes a passage and various example questions and allows you to interactively make modifications and see the outcome.
-
-## Accuracy
-
-### Evaluating Int8 Accuracy Using The SQuAD Dataset
-1.  Download checkpoints for a BERT Large FP32 SQuAD v1.1 model with a sequence length of 128 and 384:
-    ```bash
-    bash scripts/download_model.sh large fp32 128 v1_1
-    bash scripts/download_model.sh large fp32 384 v1_1
-    ```
-
-2. Build an engine:
-    ```bash
-    mkdir -p /workspace/bert/engines && python3 builder.py -m /workspace/bert/models/fine-tuned/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474 -o /workspace/bert/engines/bert_large_384_int8mix.engine -b 1 -s 384 --int8 --fp16 --strict -c /workspace/bert/models/fine-tuned/bert_tf_v2_large_fp16_128_v2 --squad-json ./squad/dev-v1.1.json -v /workspace/bert/models/fine-tuned/bert_tf_v1_1_large_fp32_384_v2/vocab.txt --calib-num 100
-    ```
-
-    This will build and engine with a maximum batch size of 1 (`-b 1`), calibration dataset squad (`--squad-json ./squad/dev-v1.1.json`), calibration sentences number 100 (`--calib-num 100`), and sequence length of 128 (`-s 128`) using INT8 mixed precision computation where possible (`--int8 --fp16 --strict`).
-
-3. Run inference using the squad dataset, and evaluate the F1 score and exact match score:
-    ```bash
-    python3 inference.py -e /workspace/bert/engines/bert_large_384_int8mix.engine -s 384 -sq ./squad/dev-v1.1.json -v /workspace/bert/models/fine-tuned/bert_tf_v1_1_large_fp32_384_v2/vocab.txt -o ./predictions.json
-    python3 squad/evaluate-v1.1.py  squad/dev-v1.1.json  ./predictions.json 90
-    ```
-
-## Performance
-
-### Benchmarking
-
-The following section shows how to run benchmarks measuring the model performance in inference modes.
-
-#### TensorRT inference benchmark
-
-The inference benchmark is performed on a single GPU by the `inference_benchmark.sh` script, which takes the following steps for each set of model parameters:
-
-1. Downloads checkpoints and builds a TensorRT engine if it does not already exist.
-
-2. Run the inference benchmark, which performs a sweep across batch sizes (1-128) and sequence lengths (128, 384). In each configuration, 1 warm-up iteration is followed by 200 runs to measure and report the BERT inference latencies.
-
-**Note:** The time measurements do not include the time required to copy inputs to the device and copy outputs to the host.
-
-To run the inference benchmark script, run:
-```bash
-bash scripts/inference_benchmark.sh
-```
-
-Note: Some of the configurations in the benchmark script require 16GB of GPU memory. On GPUs with smaller amounts of memory, parts of the benchmark may fail to run.
-
-Also note that BERT Large engines, especially using mixed precision with large batch sizes and sequence lengths may take a couple hours to build.
-
-### Results
-
-The following sections provide details on how we achieved our performance and inference.
-
-#### Inference performance: NVIDIA T4 (16GB)
-
-Our results were obtained by running the `scripts/inference_benchmark.sh` training script in the container generated by the included Dockerfile on NVIDIA T4 with (1x T4 16G) GPUs.
-
-
-##### BERT Base
-
-| Sequence Length | Batch Size | TensorRT Mixed Precision Latency (ms) ||         | TensorRT FP32 Latency (ms) |           |         |
-|-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
-|                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 1.97 | 1.97 | 1.93 | 6.47 | 6.51 | 6.12 |
-| 128 | 2 | 2.94 | 2.99 | 2.86 | 11.55 | 11.84 | 11.25 |
-| 128 | 4 | 5.00 | 8.44 | 4.88 | 22.08 | 22.63 | 21.90 |
-| 128 | 8 | 10.57 | 11.55 | 9.78 | 43.74 | 43.97 | 42.83 |
-| 128 | 12 | 15.01 | 15.27 | 14.56 | 68.42 | 69.71 | 67.47 |
-| 128 | 16 | 21.64 | 22.92 | 19.12 | 90.90 | 97.17 | 88.47 |
-| 128 | 24 | 31 | 31.65 | 29.71 | 131.11 | 133.5 | 129.43 |
-| 128 | 32 | 41.27 | 43.65 | 38.54 | 178.45 | 182.65 | 176.77 |
-| 128 | 64 | 76.73 | 81.31 | 73.89 | 364.31 | 364.68 | 362.05 |
-| 128 | 128 | 151.95 | 152.35 | 150.54 | 672.25 | 673.02 | 669.60 |
-| 384 | 1 | 5.18 | 5.19 | 4.97 | 19.11 | 19.13 | 18.44 |
-| 384 | 2 | 9.82 | 9.92 | 9.51 | 37.5 | 38.31 | 36.93 |
-| 384 | 4 | 18.08 | 19.46 | 17.56 | 77.01 | 81.02 | 74.98 |
-| 384 | 8 | 37.32 | 37.94 | 36.77 | 147.05 | 148.85 | 145.27 |
-| 384 | 12 | 56.91 | 57.52 | 55.43 | 218.76 | 219.32 | 217.04 |
-| 384 | 16 | 73.35 | 76.45 | 71.76 | 302.05 | 303.38 | 299.29 |
-| 384 | 24 | 110.14 | 110.78 | 109.03 | 430.22 | 430.91 | 428.49 |
-| 384 | 32 | 140.05 | 140.92 | 138.61 | 618.31 | 619.78 | 613.26 |
-| 384 | 64 | 284.99 | 285.86 | 282.54 | 1218.55 | 1227.73 | 1215.81 |
-| 384 | 128 | 579.86 | 580.91 | 577.25 | 2325.91 | 2327.81 | 2319.26 |
-
-
-
-##### BERT Large
-
-| Sequence Length | Batch Size | TensorRT Mixed Precision Latency (ms) ||         | TensorRT FP32 Latency (ms) |           |         |
-|-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
-|                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 5.63 | 5.66 | 5.39 | 21.53 | 22.16 | 20.74 |
-| 128 | 2 | 9.11 | 9.83 | 8.89 | 40.31 | 40.45 | 39.24 |
-| 128 | 4 | 16.03 | 17.45 | 15.34 | 81.66 | 85.56 | 78.35 |
-| 128 | 8 | 33.2 | 33.98 | 32.59 | 145.86 | 146.2 | 144.46 |
-| 128 | 12 | 48.87 | 49.58 | 48.16 | 223.69 | 225.05 | 222.22 |
-| 128 | 16 | 64.48 | 68.01 | 62.60 | 289.42 | 292.36 | 286.33 |
-| 128 | 24 | 92.63 | 94.4 | 90.90 | 434.81 | 435.49 | 433.37 |
-| 128 | 32 | 121.63 | 125.25 | 118.14 | 611.33 | 612.58 | 604.69 |
-| 128 | 64 | 237.01 | 239.95 | 233.15 | 1231.35 | 1232.71 | 1220.68 |
-| 128 | 128 | 484.48 | 485.39 | 483.37 | 2338.03 | 2341.99 | 2316.32 |
-| 384 | 1 | 15.89 | 16.01 | 15.49 | 63.13 | 63.54 | 61.96 |
-| 384 | 2 | 30.1 | 30.2 | 29.56 | 121.37 | 122 | 120.19 |
-| 384 | 4 | 56.64 | 60.46 | 55.17 | 247.53 | 248.09 | 243.16 |
-| 384 | 8 | 114.53 | 115.74 | 112.91 | 485.92 | 486.85 | 484.55 |
-| 384 | 12 | 168.8 | 170.65 | 164.88 | 709.33 | 709.88 | 707.13 |
-| 384 | 16 | 217.53 | 218.89 | 214.36 | 1005.50 | 1007.29 | 992.56 |
-| 384 | 24 | 330.84 | 332.89 | 327.96 | 1489.48 | 1490.96 | 1480.36 |
-| 384 | 32 | 454.32 | 461.05 | 443.58 | 1986.66 | 1988.94 | 1976.53 |
-| 384 | 64 | 865.36 | 866.96 | 860.22 | 4029.11 | 4031.18 | 4015.06 |
-| 384 | 128 | 1762.72 | 1764.65 | 1756.79 | 7736.41 | 7739.45 | 7718.88 |
-
-
-
-#### Inference performance: NVIDIA V100 (32GB)
-
-Our results were obtained by running the `scripts/inference_benchmark.sh` training script in the container generated by the included Dockerfile on NVIDIA V100 with (1x V100 32G) GPUs.
-
-
-##### BERT Base
-
-| Sequence Length | Batch Size | TensorRT Mixed Precision Latency (ms) ||         | TensorRT FP32 Latency (ms) |           |         |
-|-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
-|                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 1.39 | 1.45 | 1.37 | 2.93 | 2.95 | 2.91 |
-| 128 | 2 | 1.63 | 1.63 | 1.62 | 4.65 | 4.68 | 4.62 |
-| 128 | 4 | 2.75 | 2.76 | 2.56 | 8.68 | 9.50 | 8.27 |
-| 128 | 8 | 3.58 | 3.59 | 3.55 | 15.56 | 15.63 | 15.42 |
-| 128 | 12 | 4.94 | 4.96 | 4.90 | 23.48 | 23.52 | 23.23 |
-| 128 | 16 | 7.86 | 7.90 | 7.01 | 30.23 | 30.29 | 29.87 |
-| 128 | 24 | 8.94 | 8.94 | 8.89 | 43.52 | 43.59 | 43.24 |
-| 128 | 32 | 13.25 | 13.59 | 13.11 | 56.45 | 56.79 | 56.10 |
-| 128 | 64 | 25.05 | 25.38 | 24.90 | 111.98 | 112.19 | 111.42 |
-| 128 | 128 | 46.31 | 46.38 | 46.01 | 219.6 | 220.3 | 219.22 |
-| 384 | 1 | 2.17 | 2.21 | 2.16 | 6.77 | 6.79 | 6.73 |
-| 384 | 2 | 3.39 | 3.46 | 3.38 | 13.12 | 13.16 | 13.04 |
-| 384 | 4 | 6.79 | 7.09 | 6.29 | 25.33 | 25.45 | 25.16 |
-| 384 | 8 | 10.84 | 10.86 | 10.78 | 47.94 | 48.16 | 47.65 |
-| 384 | 12 | 16.75 | 16.78 | 16.68 | 72.34 | 72.44 | 72.10 |
-| 384 | 16 | 22.66 | 23.28 | 22.56 | 94.65 | 94.93 | 94.08 |
-| 384 | 24 | 32.41 | 32.44 | 32.23 | 137.46 | 137.59 | 137.11 |
-| 384 | 32 | 44.29 | 44.34 | 44.02 | 186.96 | 187.06 | 185.85 |
-| 384 | 64 | 88.56 | 88.72 | 88.15 | 373.48 | 374.26 | 372.37 |
-| 384 | 128 | 165.93 | 166.14 | 165.34 | 739.52 | 740.65 | 737.33 |
-
-
-
-##### BERT Large
-
-| Sequence Length | Batch Size | TensorRT Mixed Precision Latency (ms) ||         | TensorRT FP32 Latency (ms) |           |         |
-|-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
-|                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 3.4 | 3.46 | 3.38 | 8.83 | 8.85 | 8.76 |
-| 128 | 2 | 4.15 | 4.17 | 4.13 | 14.53 | 14.58 | 14.42 |
-| 128 | 4 | 6.76 | 7.41 | 6.45 | 27.40 | 27.52 | 27.22 |
-| 128 | 8 | 11.34 | 11.35 | 11.25 | 53.22 | 53.35 | 53.11 |
-| 128 | 12 | 15.8 | 15.84 | 15.73 | 75.1 | 75.42 | 74.81 |
-| 128 | 16 | 21.64 | 22.27 | 21.50 | 102.64 | 102.71 | 101.92 |
-| 128 | 24 | 30.11 | 30.16 | 29.88 | 148.52 | 148.76 | 147.72 |
-| 128 | 32 | 40.42 | 40.54 | 40.05 | 203.56 | 203.65 | 202.22 |
-| 128 | 64 | 78.77 | 79.01 | 78.04 | 392.26 | 393.11 | 389.84 |
-| 128 | 128 | 149.32 | 149.69 | 148.55 | 793.46 | 795.62 | 789.83 |
-| 384 | 1 | 6.1 | 6.12 | 6.06 | 21.92 | 21.98 | 21.88 |
-| 384 | 2 | 10.16 | 10.18 | 10.08 | 42.47 | 42.52 | 42.35 |
-| 384 | 4 | 18.91 | 19.54 | 18.76 | 82.64 | 83.03 | 82.25 |
-| 384 | 8 | 35.15 | 35.18 | 34.97 | 164.88 | 164.98 | 164.07 |
-| 384 | 12 | 50.31 | 50.36 | 50.04 | 245.53 | 245.85 | 244.50 |
-| 384 | 16 | 69.46 | 69.89 | 69.04 | 321.36 | 321.71 | 318.98 |
-| 384 | 24 | 97.63 | 97.91 | 97.26 | 485.11 | 485.37 | 482.41 |
-| 384 | 32 | 135.16 | 135.70 | 134.39 | 636.32 | 637.40 | 632.66 |
-| 384 | 64 | 269.98 | 271.40 | 268.63 | 1264.41 | 1265.69 | 1261.08 |
-| 384 | 128 | 513.71 | 514.38 | 511.80 | 2503.02 | 2505.81 | 2499.51 |
-

+ 0 - 535
TensorFlow/LanguageModeling/BERT/trt/builder.py

@@ -1,535 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tensorrt as trt
-import ctypes
-import argparse
-import numpy as np
-import json
-import time
-import sys
-import re
-import os
-import os.path
-from helpers.calibrator import BertCalibrator as BertCalibrator
-
-try:
-    from tensorflow.python import pywrap_tensorflow as pyTF
-except ImportError as err:
-    sys.stderr.write("""Error: Failed to import tensorflow module ({})\n""".format(err))
-    sys.exit()
-
-"""
-TensorRT Initialization
-"""
-TRT_LOGGER = trt.Logger(trt.Logger.INFO)
-
-handle = ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
-if not handle:
-    raise RuntimeError("Could not load plugin library. Is `libnvinfer_plugin.so` on your LD_LIBRARY_PATH?")
-
-trt.init_libnvinfer_plugins(TRT_LOGGER, "")
-plg_registry = trt.get_plugin_registry()
-qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic", "1", "")
-skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic", "1", "")
-gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic", "1", "")
-emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic", "1", "")
-fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic", "1", "")
-
-
-"""
-Attentions Keys
-"""
-WQ = "query_kernel"
-BQ = "query_bias"
-WK = "key_kernel"
-BK = "key_bias"
-WV = "value_kernel"
-BV = "value_bias"
-WQKV = "qkv_kernel"
-BQKV = "qkv_bias"
-
-
-"""
-Transformer Keys
-"""
-W_AOUT = "attention_output_dense_kernel"
-B_AOUT = "attention_output_dense_bias"
-AOUT_LN_BETA = "attention_output_layernorm_beta"
-AOUT_LN_GAMMA = "attention_output_layernorm_gamma"
-W_MID = "intermediate_dense_kernel"
-B_MID = "intermediate_dense_bias"
-W_LOUT = "output_dense_kernel"
-B_LOUT = "output_dense_bias"
-LOUT_LN_BETA = "output_layernorm_beta"
-LOUT_LN_GAMMA = "output_layernorm_gamma"
-
-
-"""
-Squad Output Keys
-"""
-SQD_W = "squad_output_weights"
-SQD_B = "squad_output_bias"
-
-class BertConfig:
-    def __init__(self, bert_config_path, use_fp16, use_int8, use_strict, use_fc2_gemm):
-        with open(bert_config_path, 'r') as f:
-            data = json.load(f)
-            self.num_attention_heads = data['num_attention_heads']
-            self.hidden_size = data['hidden_size']
-            self.intermediate_size = data['intermediate_size']
-            self.num_hidden_layers = data['num_hidden_layers']
-            self.use_fp16 = use_fp16
-            self.use_int8 = use_int8
-            self.use_fc2_gemm = use_fc2_gemm
-            self.use_strict = use_strict
-            self.head_size = self.hidden_size // self.num_attention_heads
-
-
-
-def set_tensor_name(tensor, prefix, name):
-    tensor.name = prefix + name
-
-def set_output_name(layer, prefix, name, out_idx = 0):
-    layer.name = prefix + name
-    set_tensor_name(layer.get_output(out_idx), prefix, name)
-
-def make_gelu_layer(prefix, config, network, input_tensor):
-    POW = network.add_constant((1, 1, 1, 1, 1), trt.Weights(np.ascontiguousarray([3.0], dtype=np.float32)))
-    MULTIPLY = network.add_constant((1, 1, 1, 1, 1), trt.Weights(np.ascontiguousarray([0.044715], dtype=np.float32)))
-    SQRT = network.add_constant((1, 1, 1, 1, 1), trt.Weights((np.ascontiguousarray([0.79788456080286535587989211986876], dtype=np.float32))))
-    ONE = network.add_constant((1, 1, 1, 1, 1), trt.Weights((np.ascontiguousarray([1.0], dtype=np.float32))))
-    HALF = network.add_constant((1, 1, 1, 1, 1), trt.Weights((np.ascontiguousarray([0.5], dtype=np.float32))))
-    X_pow = network.add_elementwise(input_tensor, POW.get_output(0), trt.ElementWiseOperation.POW)
-    X_pow_t = X_pow.get_output(0)
-    X_mul = network.add_elementwise(X_pow_t, MULTIPLY.get_output(0), trt.ElementWiseOperation.PROD)
-    X_add = network.add_elementwise(mid_dense_out, X_mul.get_output(0), trt.ElementWiseOperation.SUM)
-    X_sqrt = network.add_elementwise(X_add.get_output(0), SQRT.get_output(0), trt.ElementWiseOperation.PROD)
-    X_sqrt_tensor = X_sqrt.get_output(0)
-    X_tanh = network.add_activation(X_sqrt_tensor, trt.ActivationType.TANH)
-    X_tanh_tensor = X_tanh.get_output(0)
-    X_one = network.add_elementwise(X_tanh_tensor, ONE.get_output(0), trt.ElementWiseOperation.SUM)
-    CDF = network.add_elementwise(X_one.get_output(0), HALF.get_output(0), trt.ElementWiseOperation.PROD)
-    gelu_layer = network.add_elementwise(CDF.get_output(0), mid_dense_out, trt.ElementWiseOperation.PROD)
-
-    # enable elementwise fusing for int8 && fp16
-    POW.precision = trt.DataType.FLOAT
-    MULTIPLY.precision = trt.DataType.FLOAT
-    SQRT.precision = trt.DataType.FLOAT
-    ONE.precision = trt.DataType.FLOAT
-    HALF.precision = trt.DataType.FLOAT
-    X_pow.precision = trt.DataType.FLOAT
-    X_mul.precision = trt.DataType.FLOAT
-    X_add.precision = trt.DataType.FLOAT
-    X_sqrt.precision = trt.DataType.FLOAT
-    X_tanh.precision = trt.DataType.FLOAT
-    X_one.precision = trt.DataType.FLOAT
-    CDF.precision = trt.DataType.FLOAT
-    gelu_layer.precision = trt.DataType.FLOAT
-    return gelu_layer
-
-
-
-def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask):
-    """
-    Add the attention layer
-    """
-    assert(len(input_tensor.shape) == 5)
-    B, S, hidden_size, _, _ = input_tensor.shape
-    num_heads = config.num_attention_heads
-    head_size = int(hidden_size / num_heads)
-
-    Wall = init_dict[prefix + WQKV]
-    Ball = init_dict[prefix + BQKV]
-
-    # FC_attention
-    if config.use_int8:
-        mult_all = network.add_convolution(input_tensor, 3 * hidden_size, (1, 1), Wall, Ball)
-    else:
-        mult_all = network.add_fully_connected(input_tensor, 3 * hidden_size, Wall, Ball)
-
-    set_output_name(mult_all, prefix, "qkv_mult")
-
-    has_mask = imask is not None
-
-    pf_type = trt.PluginField("type_id", np.array([1 if config.use_fp16 else 0], np.int32), trt.PluginFieldType.INT32)
-    pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32)
-    pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32)
-    pf_has_mask = trt.PluginField("has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32)
-
-    pfc = trt.PluginFieldCollection([pf_hidden_size, pf_num_heads, pf_has_mask, pf_type])
-    qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc)
-
-    qkv_in = [mult_all.get_output(0)]
-    if has_mask:
-        qkv_in.append(imask)
-    qkv2ctx = network.add_plugin_v2(qkv_in, qkv2ctx_plug)
-    set_output_name(qkv2ctx, prefix, "context_layer")
-    return qkv2ctx
-
-
-def skipln(prefix, config, init_dict, network, input_tensor, skip, bias=None):
-    """
-    Add the skip layer
-    """
-    idims = input_tensor.shape
-    assert len(idims) == 5
-    hidden_size = idims[2]
-
-    pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32)
-    wbeta = init_dict[prefix + "beta"]
-    pf_beta = trt.PluginField("beta", wbeta.numpy(), trt.PluginFieldType.FLOAT32)
-    wgamma = init_dict[prefix + "gamma"]
-    pf_gamma = trt.PluginField("gamma", wgamma.numpy(), trt.PluginFieldType.FLOAT32)
-    pf_type = trt.PluginField("type_id", np.array([1 if config.use_fp16 else 0], np.int32), trt.PluginFieldType.INT32)
-
-    fields = [pf_ld, pf_beta, pf_gamma, pf_type ]
-
-    if bias:
-        pf_bias = trt.PluginField("bias", bias.numpy(), trt.PluginFieldType.FLOAT32)
-        fields.append(pf_bias)
-
-    pfc = trt.PluginFieldCollection(fields)
-    skipln_plug = skln_plg_creator.create_plugin("skipln", pfc)
-
-    skipln_inputs = [input_tensor, skip]
-    layer = network.add_plugin_v2(skipln_inputs, skipln_plug)
-    return layer
-
-def my_fc(config, network, input_tensor,out_dims, W):
-    pf_out_dims = trt.PluginField('out_dims', np.array([out_dims], dtype=np.int32), trt.PluginFieldType.INT32)
-    pf_W = trt.PluginField('W', W.numpy(), trt.PluginFieldType.FLOAT32)
-    pf_type = trt.PluginField("type_id", np.array([1 if config.use_fp16 else 0], np.int32), trt.PluginFieldType.INT32)
-    pfc = trt.PluginFieldCollection([pf_out_dims, pf_W, pf_type])
-    fc_plugin = fc_plg_creator.create_plugin('fcplugin', pfc)
-    plug_inputs = [input_tensor]
-    out_dense = network.add_plugin_v2(plug_inputs, fc_plugin)
-    return out_dense
-
-
-def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imask):
-    """
-    Add the transformer layer
-    """
-    idims = input_tensor.shape
-    assert len(idims) == 5
-    hidden_size = idims[2]
-
-    B_noop = trt.Weights()
-
-    context_transposed = attention_layer_opt(prefix + "attention_self_", config, init_dict, network, input_tensor, imask)
-    attention_heads = context_transposed.get_output(0)
-
-    # FC0
-    B_aout = init_dict[prefix + B_AOUT]
-    if config.use_int8:
-        W_aout = init_dict[prefix + W_AOUT]
-        attention_out_fc = network.add_convolution(attention_heads, hidden_size, (1, 1), W_aout, B_aout)
-        B_aout = None
-
-        if config.use_fp16:
-            attention_out_fc.precision = trt.DataType.INT8
-            attention_out_fc.set_output_type(0, trt.DataType.HALF)
-    else:
-        W_aoutT = init_dict[prefix + W_AOUT + '_notrans']
-        attention_out_fc = my_fc(config, network, attention_heads, hidden_size, W_aoutT)
-
-    skiplayer = skipln(prefix + "attention_output_layernorm_",config, init_dict, network, attention_out_fc.get_output(0), input_tensor, B_aout)
-    attention_ln = skiplayer.get_output(0)
-
-    # FC1 + GELU
-    B_mid = init_dict[prefix + B_MID]
-    W_midT = init_dict[prefix + W_MID + '_notrans']
-    mid_dense = my_fc(config, network, attention_ln, config.intermediate_size, W_midT)
-    mid_dense_out = mid_dense.get_output(0)
-
-    pf_type = trt.PluginField("type_id", np.array([1 if config.use_fp16 else 0], np.int32), trt.PluginFieldType.INT32)
-    pf_bias = trt.PluginField("bias", B_mid.numpy(), trt.PluginFieldType.FLOAT32)
-    pfc = trt.PluginFieldCollection([pf_type, pf_bias])
-
-    plug = gelu_plg_creator.create_plugin("gelu", pfc)
-
-    gelu_layer = network.add_plugin_v2([mid_dense_out], plug)
-
-    intermediate_act = gelu_layer.get_output(0)
-    set_tensor_name(intermediate_act, prefix, "gelu")
-    
-    if config.use_int8 and config.use_strict:
-        intermediate_act.set_dynamic_range(-10, 10)
-
-    # FC2
-    # Dense to hidden size
-    B_lout = init_dict[prefix + B_LOUT]
-    if config.use_int8 and config.use_strict and not config.use_fc2_gemm:
-        W_lout = init_dict[prefix + W_LOUT]
-        out_dense = network.add_convolution(intermediate_act, hidden_size, (1, 1), W_lout, B_lout)
-        B_lout = None
-    else:
-        W_loutT = init_dict[prefix + W_LOUT + '_notrans']
-        out_dense = my_fc(config, network, intermediate_act, hidden_size, W_loutT)
-
-    set_output_name(out_dense, prefix + "output_", "dense")
-    out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, out_dense.get_output(0), attention_ln, B_lout)
-    out_ln = out_layer.get_output(0)
-
-    set_tensor_name(out_ln, prefix + "output_", "reshape")
-
-    return out_ln
-
-
-def bert_model(config, init_dict, network, input_tensor, input_mask):
-    """
-    Create the bert model
-    """
-    prev_input = input_tensor
-    for layer in range(0, config.num_hidden_layers):
-        ss = "l{}_".format(layer)
-        prev_input = transformer_layer_opt(ss, config,  init_dict, network, prev_input, input_mask)
-    return prev_input
-
-
-def squad_output(prefix, config, init_dict, network, input_tensor):
-    """
-    Create the squad output
-    """
-
-    idims = input_tensor.shape
-    assert len(idims) == 5
-    B, S, hidden_size, _, _ = idims
-
-    W_out = init_dict[prefix + SQD_W]
-    B_out = init_dict[prefix + SQD_B]
-
-    W = network.add_constant((1, hidden_size, 2), W_out)
-    dense = network.add_fully_connected(input_tensor, 2, W_out, B_out)
-
-    OUT = network.add_shuffle(dense.get_output(0))
-    OUT.second_transpose = (1, 0, 2, 3, 4)
-    set_output_name(OUT, prefix, "squad_logits")
-    return OUT
-
-
-def load_weights(inputbase, config):
-    """
-    Load the weights from the tensorflow checkpoint
-    """
-    weights_dict = dict()
-
-    try:
-        reader = pyTF.NewCheckpointReader(inputbase)
-        tensor_dict = reader.get_variable_to_shape_map()
-
-        # There might be training-related variables in the checkpoint that can be discarded
-        param_names = [key for key in sorted(tensor_dict) if 'adam' not in key and 'global_step' not in key and 'pooler' not in key]
-        count = len(param_names)
-        TRT_LOGGER.log(TRT_LOGGER.INFO, "Found {:} entries in weight map".format(count))
-
-        for pn in param_names:
-            toks = pn.lower().split('/')
-            if 'encoder' in pn:
-                assert ('layer' in pn)
-                l = (re.findall('\d+', pn))[0]
-                outname = 'l{}_'.format(l) + '_'.join(toks[3:])
-            else:
-                outname = '_'.join(toks)
-
-            tensor = reader.get_tensor(pn)
-            shape = tensor.shape
-            if pn.find('kernel') != -1:
-                weights_dict[outname +'_notrans'] = trt.Weights(np.ascontiguousarray(tensor).flatten())
-
-                TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Transposing {}\n".format(np))
-                tensor = np.transpose(tensor)
-
-
-            shape = tensor.shape
-            flat_tensor = tensor.flatten()
-            shape_str = '{} '.format(len(shape)) + ' '.join([str(d) for d in shape])
-            weights_dict[outname] = trt.Weights(flat_tensor)
-
-            TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Orig.name: {:}, TRT name: {:}, shape: {:}".format(pn, outname, shape_str))
-
-        N = config.num_attention_heads
-        H = config.head_size
-
-        additional_dict = dict()
-        for key, value in weights_dict.items():
-            pos = key.find(BQ)
-            if pos != -1:
-                hidden_size = value.size
-                prefix = key[:pos]
-
-                Bq_ = value
-                Bk_ = weights_dict[prefix + BK]
-                Bv_ = weights_dict[prefix + BV]
-                Wq_ = weights_dict[prefix + WQ]
-                Wk_ = weights_dict[prefix + WK]
-                Wv_ = weights_dict[prefix + WV]
-
-                mat_size = hidden_size * hidden_size
-                wcount = 3 * mat_size
-                Wall = np.zeros(wcount, np.float32)
-                bcount = 3 * hidden_size
-                Ball = np.zeros(bcount, np.float32)
-                Wall[0:mat_size] = Wq_.numpy()[0:mat_size]
-                Wall[mat_size:2*mat_size] = Wk_.numpy()[0:mat_size]
-                Wall[2*mat_size:3*mat_size] = Wv_.numpy()[0:mat_size]
-                Ball[0:hidden_size] = Bq_.numpy()[0:hidden_size]
-                Ball[hidden_size:2*hidden_size] = Bk_.numpy()[0:hidden_size]
-                Ball[2*hidden_size:3*hidden_size] = Bv_.numpy()[0:hidden_size]
-
-                Wall = np.ascontiguousarray(Wall.reshape((3,N,H,N,H)).transpose((1,0, 2,3,4)), dtype=np.float32)
-                Ball = np.ascontiguousarray(Ball.reshape((3,N,H)).transpose((1,0, 2)), dtype=np.float32)
-
-                additional_dict[prefix + WQKV] = trt.Weights(Wall)
-                additional_dict[prefix + BQKV] = trt.Weights(Ball)
-
-                additional_dict[prefix + WQKV + "_notrans"] = trt.Weights(Wall.T)
-
-    except Exception as error:
-        TRT_LOGGER.log(TRT_LOGGER.ERROR, str(error))
-
-    weights_dict.update(additional_dict)
-    return weights_dict
-
-
-def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_length, batch_sizes):
-    if len(batch_sizes) > 1:
-        input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(sequence_length, -1))
-        segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(sequence_length, -1))
-        input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(sequence_length, -1))
-
-        # Specify profiles for the batch sizes we're interested in.
-        # Make sure the profile also works for all sizes not covered by the previous profile.
-        prev_size = 0
-        for batch_size in sorted(batch_sizes):
-            profile = builder.create_optimization_profile()
-            min_shape = (sequence_length, prev_size + 1)
-            shape = (sequence_length, batch_size)
-            profile.set_shape("input_ids", min=min_shape, opt=shape, max=shape)
-            profile.set_shape("segment_ids", min=min_shape, opt=shape, max=shape)
-            profile.set_shape("input_mask", min=min_shape, opt=shape, max=shape)
-            builder_config.add_optimization_profile(profile)
-            prev_size = batch_size
-    else:
-        input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(sequence_length, batch_sizes[0]))
-        segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(sequence_length, batch_sizes[0]))
-        input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(sequence_length, batch_sizes[0]))
-
-    wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"].numpy(), trt.PluginFieldType.FLOAT32)
-    wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"].numpy(), trt.PluginFieldType.FLOAT32)
-    wwordemb = trt.PluginField("bert_embeddings_word_embeddings", weights_dict["bert_embeddings_word_embeddings"].numpy(), trt.PluginFieldType.FLOAT32)
-    wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"].numpy(), trt.PluginFieldType.FLOAT32)
-    wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"].numpy(), trt.PluginFieldType.FLOAT32)
-
-    output_fp16 = trt.PluginField("output_fp16", np.array([1 if config.use_fp16 else 0]).astype(np.int32), trt.PluginFieldType.INT32)
-
-    pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16])
-    fn = emln_plg_creator.create_plugin("embeddings", pfc)
-
-    inputs = [input_ids, segment_ids, input_mask]
-    emb_layer = network.add_plugin_v2(inputs, fn)
-    set_output_name(emb_layer, "embeddings_", "output")
-    return emb_layer
-
-
-def build_engine(batch_sizes, sequence_length, config, weights_dict, squad_json, vocab_file, calibrationCacheFile, calib_num):
-    explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-
-    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config:
-        builder_config.max_workspace_size = 5000 * (1024 * 1024) # 5000 MiB
-        if config.use_fp16:
-            builder_config.set_flag(trt.BuilderFlag.FP16)
-        if config.use_int8:
-            calibrator = BertCalibrator(squad_json, vocab_file, calibrationCacheFile, 1, sequence_length, calib_num)
-            builder_config.set_flag(trt.BuilderFlag.INT8)
-            builder_config.int8_calibrator = calibrator
-        if config.use_strict:
-            builder_config.set_flag(trt.BuilderFlag.STRICT_TYPES)
-
-        # Create the network
-        emb_layer = emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_length, batch_sizes)
-        embeddings = emb_layer.get_output(0)
-        mask_idx = emb_layer.get_output(1)
-
-        bert_out = bert_model(config, weights_dict, network, embeddings, mask_idx)
-
-        squad_logits = squad_output("cls_", config, weights_dict, network, bert_out)
-        squad_logits_out = squad_logits.get_output(0)
-
-        network.mark_output(squad_logits_out)
-
-        build_start_time = time.time()
-        engine = builder.build_engine(network, builder_config)
-        build_time_elapsed = (time.time() - build_start_time)
-        TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed))
-        if config.use_int8:
-            calibrator.free()
-        return engine
-
-def generate_calibration_cache(sequence_length, config, weights_dict, squad_json, vocab_file, calib_num):
-    # dynamic shape not working with calibration, so we need generate a calibration cache first using fulldims network
-    calibrationCacheFile = "bertSquadCalibCache"
-    if not config.use_int8 or os.path.exists(calibrationCacheFile):
-        return calibrationCacheFile
-
-    # generate calibration cache
-    saved_use_fp16 = config.use_fp16
-    config.use_fp16 = False
-
-    with build_engine([1], sequence_length, config, weights_dict, squad_json, vocab_file, calibrationCacheFile, calib_num) as engine:
-        TRT_LOGGER.log(TRT_LOGGER.INFO, "calibration cache generated in {:}".format(calibrationCacheFile))
-
-    config.use_fp16 = saved_use_fp16
-    return calibrationCacheFile
-
-def main():
-    parser = argparse.ArgumentParser(description='TensorRT BERT Sample', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('-m', '--ckpt', required=True,
-                        help='The checkpoint file basename, e.g.: basename(model.ckpt-766908.data-00000-of-00001) is model.ckpt-766908')
-    parser.add_argument('-o', '--output', required=True, default="bert_base_384.engine", help='The bert engine file, ex bert.engine')
-    parser.add_argument('-b', '--batch-size', default=[], action="append", help='Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.', type=int)
-    parser.add_argument('-s', '--sequence-length', default=128, help='Sequence length of the BERT model', type=int)
-    parser.add_argument('-c', '--config-dir', required=True,
-                        help='The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google')
-    parser.add_argument('-f', '--fp16', action='store_true', help='Indicates that inference should be run in FP16 precision', required=False)
-    parser.add_argument('-i', '--int8', action='store_true', help='Indicates that inference should be run in INT8 precision', required=False)
-    parser.add_argument('-t', '--strict', action='store_true', help='Indicates that inference should be run in strict precision mode', required=False)
-    parser.add_argument('-j', '--squad-json', default='squad/dev-v1.1.json', help='squad json dataset used for int8 calibration', required=False)
-    parser.add_argument('-v', '--vocab-file', default='./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt', help='Path to file containing entire understandable vocab', required=False)
-    parser.add_argument('-n', '--calib-num', default=100, help='calibration batch numbers', type=int)
-    parser.add_argument('-g', '--force-fc2-gemm', action='store_true', help='Force use gemm to implement FC2 layer', required=False)
-
-    args, _ = parser.parse_known_args()
-    args.batch_size = args.batch_size or [1]
-
-    bert_config_path = os.path.join(args.config_dir, 'bert_config.json')
-    TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path))
-    config = BertConfig(bert_config_path, args.fp16, args.int8, args.strict, args.force_fc2_gemm)
-
-    weights_dict = load_weights(args.ckpt, config)
-    calib_cache = generate_calibration_cache(args.sequence_length, config, weights_dict, args.squad_json, args.vocab_file, args.calib_num)
-
-    with build_engine(args.batch_size, args.sequence_length, config, weights_dict, args.squad_json, args.vocab_file, calib_cache, args.calib_num) as engine:
-        TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...")
-        serialized_engine = engine.serialize()
-        TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output))
-        with open(args.output, 'wb') as fout:
-            fout.write(serialized_engine)
-        TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.")
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 0
TensorFlow/LanguageModeling/BERT/trt/helpers/__init__.py


+ 0 - 493
TensorFlow/LanguageModeling/BERT/trt/helpers/data_processing.py

@@ -1,493 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Modifications copyright (C) 2019 NVIDIA Corp.
-
-import helpers.tokenization as tokenization
-import collections
-import numpy as np
-import six
-import math
-import json
-
-
-def convert_doc_tokens(paragraph_text):
-
-    """ Return the list of tokens from the doc text """
-    def is_whitespace(c):
-        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-            return True
-        return False
-
-    doc_tokens = []
-    prev_is_whitespace = True
-    for c in paragraph_text:
-        if is_whitespace(c):
-            prev_is_whitespace = True
-        else:
-            if prev_is_whitespace:
-                doc_tokens.append(c)
-            else:
-                doc_tokens[-1] += c
-            prev_is_whitespace = False
-
-    return doc_tokens
-
-
-def _check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token."""
-
-    # Because of the sliding window approach taken to scoring documents, a single
-    # token can appear in multiple documents. E.g.
-    #  Doc: the man went to the store and bought a gallon of milk
-    #  Span A: the man went to the
-    #  Span B: to the store and bought
-    #  Span C: and bought a gallon of
-    #  ...
-    #
-    # Now the word 'bought' will have two scores from spans B and C. We only
-    # want to consider the score with "maximum context", which we define as
-    # the *minimum* of its left and right context (the *sum* of left and
-    # right context will always be the same, of course).
-    #
-    # In the example the maximum context for 'bought' would be span C since
-    # it has 1 left context and 3 right context, while span B has 4 left context
-    # and 0 right context.
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span.start + doc_span.length - 1
-        if position < doc_span.start:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span.start
-        num_right_context = end - position
-        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
-
-    return cur_span_index == best_span_index
-
-
-def convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_length,
-                                 doc_stride, max_query_length):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    query_tokens = tokenizer.tokenize(question_text)
-
-    if len(query_tokens) > max_query_length:
-        query_tokens = query_tokens[0:max_query_length]
-
-    tok_to_orig_index = []
-    orig_to_tok_index = []
-    all_doc_tokens = []
-    for (i, token) in enumerate(doc_tokens):
-        orig_to_tok_index.append(len(all_doc_tokens))
-        sub_tokens = tokenizer.tokenize(token)
-        for sub_token in sub_tokens:
-            tok_to_orig_index.append(i)
-            all_doc_tokens.append(sub_token)
-
-    # The -3 accounts for [CLS], [SEP] and [SEP]
-    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-
-    # We can have documents that are longer than the maximum sequence length.
-    # To deal with this we do a sliding window approach, where we take chunks
-    # of the up to our max length with a stride of `doc_stride`.
-    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-        "DocSpan", ["start", "length"])
-    doc_spans = []
-    start_offset = 0
-    while start_offset < len(all_doc_tokens):
-        length = len(all_doc_tokens) - start_offset
-        if length > max_tokens_for_doc:
-            length = max_tokens_for_doc
-        doc_spans.append(_DocSpan(start=start_offset, length=length))
-        if start_offset + length == len(all_doc_tokens):
-            break
-        start_offset += min(length, doc_stride)
-
-    _Feature = collections.namedtuple(  # pylint: disable=invalid-name
-            "Feature",
-            ["input_ids", "input_mask", "segment_ids", "tokens", "token_to_orig_map", "token_is_max_context"])
-
-        
-    features = []
-    for (doc_span_index, doc_span) in enumerate(doc_spans):
-        tokens = []
-        token_to_orig_map = {}
-        token_is_max_context = {}
-        segment_ids = []
-        tokens.append("[CLS]")
-        segment_ids.append(0)
-        for token in query_tokens:
-            tokens.append(token)
-            segment_ids.append(0)
-        tokens.append("[SEP]")
-        segment_ids.append(0)
-
-        for i in range(doc_span.length):
-            split_token_index = doc_span.start + i
-            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
-
-            is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
-            token_is_max_context[len(tokens)] = is_max_context
-            tokens.append(all_doc_tokens[split_token_index])
-            segment_ids.append(1)
-        tokens.append("[SEP]")
-        segment_ids.append(1)
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        while len(input_ids) < max_seq_length:
-            input_ids.append(0)
-            input_mask.append(0)
-            segment_ids.append(0)
-
-        assert len(input_ids) == max_seq_length
-        assert len(input_mask) == max_seq_length
-        assert len(segment_ids) == max_seq_length
-
-        def create_int_feature(values):
-            feature = np.asarray(values, dtype=np.int32, order=None)
-            return feature
-
-
-        features.append(_Feature(
-            input_ids = create_int_feature(input_ids),
-            input_mask = create_int_feature(input_mask),
-            segment_ids = create_int_feature(segment_ids),
-            tokens = tokens,
-            token_to_orig_map = token_to_orig_map,
-            token_is_max_context = token_is_max_context
-            ))
-    return features
-
-
-def read_squad_json(input_file):
-    """read from squad json into a list of examples"""
-    with open(input_file, "r", encoding='utf-8') as reader:
-        input_data = json.load(reader)["data"]
-
-    _Example = collections.namedtuple(  # pylint: disable=invalid-name
-            "Example",
-            ["id", "question_text", "doc_tokens"])
-
-    examples = []
-    for entry in input_data:
-        for paragraph in entry["paragraphs"]:
-            paragraph_text = paragraph["context"]
-            doc_tokens = convert_doc_tokens(paragraph_text)
-
-            for qa in paragraph["qas"]:
-                examples.append(_Example(
-                    id = qa["id"],
-                    question_text = qa["question"],
-                    doc_tokens = doc_tokens
-                    ))
-
-    return examples
-
-
-def _get_best_indexes(logits, n_best_size):
-    """Get the n-best logits from a list."""
-
-    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
-
-    best_indexes = []
-    for i in range(len(index_and_score)):
-        if i >= n_best_size:
-            break
-        best_indexes.append(index_and_score[i][0])
-    return best_indexes
-
-
-def get_final_text(pred_text, orig_text, do_lower_case):
-    """Project the tokenized prediction back to the original text."""
-
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the SQuAD eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heruistic between
-    # `pred_text` and `orig_text` to get a character-to-charcter alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
-
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
-                continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
-
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
-    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
-
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
-
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
-
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-
-    if len(orig_ns_text) != len(tok_ns_text):
-        return orig_text
-
-    # We then project the characters in `pred_text` back to `orig_text` using
-    # the character-to-character alignment.
-    tok_s_to_ns_map = {}
-    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
-        tok_s_to_ns_map[tok_index] = i
-
-    orig_start_position = None
-    if start_position in tok_s_to_ns_map:
-        ns_start_position = tok_s_to_ns_map[start_position]
-        if ns_start_position in orig_ns_to_s_map:
-            orig_start_position = orig_ns_to_s_map[ns_start_position]
-
-    if orig_start_position is None:
-        return orig_text
-
-    orig_end_position = None
-    if end_position in tok_s_to_ns_map:
-        ns_end_position = tok_s_to_ns_map[end_position]
-        if ns_end_position in orig_ns_to_s_map:
-            orig_end_position = orig_ns_to_s_map[ns_end_position]
-
-    if orig_end_position is None:
-        return orig_text
-
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
-    return output_text
-
-
-def _compute_softmax(scores):
-    """Compute softmax probability over raw logits."""
-    if not scores:
-        return []
-
-    max_score = None
-    for score in scores:
-        if max_score is None or score > max_score:
-            max_score = score
-
-    exp_scores = []
-    total_sum = 0.0
-    for score in scores:
-        x = math.exp(score - max_score)
-        exp_scores.append(x)
-        total_sum += x
-
-    probs = []
-    for score in exp_scores:
-        probs.append(score / total_sum)
-    return probs
-
-
-def get_predictions(doc_tokens, features, results, n_best_size, max_answer_length, version_2_with_negative = False):
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction",
-        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
-
-    prediction = ""
-    scores_diff_json = 0.0
-
-    prelim_predictions = []
-    # keep track of the minimum score of null start+end of position 0
-    score_null = 1000000  # large and positive
-    null_start_logit = 0  # the start logit at the slice with min null score
-    null_end_logit = 0  # the end logit at the slice with min null score
-    
-    for result in results:
-        start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-        end_indexes = _get_best_indexes(result.end_logits, n_best_size)
-        feature = features[result.feature_index]
-
-        # if we could have irrelevant answers, get the min score of irrelevant
-        if version_2_with_negative:
-            feature_null_score = result.start_logits[0] + result.end_logits[0]
-            if feature_null_score < score_null:
-                score_null = feature_null_score
-                null_start_logit = result.start_logits[0]
-                null_end_logit = result.end_logits[0]
-
-        for start_index in start_indexes:
-            for end_index in end_indexes:
-                # We could hypothetically create invalid predictions, e.g., predict
-                # that the start of the span is in the question. We throw out all
-                # invalid predictions.
-                if start_index >= len(feature.tokens):
-                    continue
-                if end_index >= len(feature.tokens):
-                    continue
-                if start_index not in feature.token_to_orig_map:
-                    continue
-                if end_index not in feature.token_to_orig_map:
-                    continue
-                if not feature.token_is_max_context.get(start_index, False):
-                    continue
-                if end_index < start_index:
-                    continue
-                length = end_index - start_index + 1
-                if length > max_answer_length:
-                    continue
-                prelim_predictions.append(
-                    _PrelimPrediction(
-                        feature_index=result.feature_index,
-                        start_index=start_index,
-                        end_index=end_index,
-                        start_logit=result.start_logits[start_index],
-                        end_logit=result.end_logits[end_index]))
-
-    if version_2_with_negative:
-        prelim_predictions.append(
-            _PrelimPrediction(
-                feature_index=result.feature_index,
-                start_index=0,
-                end_index=0,
-                start_logit=null_start_logit,
-                end_logit=null_end_logit))
-
-    prelim_predictions = sorted(
-        prelim_predictions,
-        key=lambda x: (x.start_logit + x.end_logit),
-        reverse=True)
-
-    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_logit", "end_logit"])
-
-    seen_predictions = {}
-    nbest = []
-    for pred in prelim_predictions:
-        if len(nbest) >= n_best_size:
-            break
-
-        if pred.start_index > 0:  # this is a non-null prediction
-            feature = features[pred.feature_index]
-            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
-            orig_doc_start = feature.token_to_orig_map[pred.start_index]
-            orig_doc_end = feature.token_to_orig_map[pred.end_index]
-            orig_tokens = doc_tokens[orig_doc_start:(orig_doc_end + 1)]
-            tok_text = " ".join(tok_tokens)
-
-            # De-tokenize WordPieces that have been split off.
-            tok_text = tok_text.replace(" ##", "")
-            tok_text = tok_text.replace("##", "")
-
-            # Clean whitespace
-            tok_text = tok_text.strip()
-            tok_text = " ".join(tok_text.split())
-            orig_text = " ".join(orig_tokens)
-
-            final_text = get_final_text(tok_text, orig_text, True)
-            if final_text in seen_predictions:
-                continue
-
-            seen_predictions[final_text] = True
-        else:
-            final_text = ""
-            seen_predictions[final_text] = True
-
-        if len(final_text):
-            nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_logit=pred.start_logit,
-                    end_logit=pred.end_logit))
-
-    # if we didn't inlude the empty option in the n-best, inlcude it
-    if version_2_with_negative:
-        if "" not in seen_predictions:
-            nbest.append(
-                _NbestPrediction(
-                    text="", start_logit=null_start_logit,
-                    end_logit=null_end_logit))
-    # In very rare edge cases we could have no valid predictions. So we
-    # just create a nonce prediction in this case to avoid failure.
-    if not nbest:
-        nbest.append(
-            _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-
-    assert len(nbest) >= 1
-
-    total_scores = []
-    best_non_null_entry = None
-    for entry in nbest:
-        total_scores.append(entry.start_logit + entry.end_logit)
-        if not best_non_null_entry:
-            if entry.text:
-                best_non_null_entry = entry
-
-    probs = _compute_softmax(total_scores)
-
-    nbest_json = []
-    for (i, entry) in enumerate(nbest):
-        output = collections.OrderedDict()
-        output["text"] = entry.text
-        output["probability"] = probs[i]
-        output["start_logit"] = entry.start_logit
-        output["end_logit"] = entry.end_logit
-        nbest_json.append(output)
-
-    assert len(nbest_json) >= 1
-
-    null_score_diff_threshold = 0.0
-    if not version_2_with_negative:
-        prediction = nbest_json[0]["text"]
-    else:
-        # predict "" iff the null score - the score of best non-null > threshold
-        score_diff = score_null - best_non_null_entry.start_logit - (
-            best_non_null_entry.end_logit)
-        scores_diff_json = score_diff
-        if score_diff > null_score_diff_threshold:
-            prediction = ""
-        else:
-            prediction = best_non_null_entry.text
-
-    return prediction, nbest_json, scores_diff_json

+ 0 - 429
TensorFlow/LanguageModeling/BERT/trt/helpers/tokenization.py

@@ -1,429 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import re
-import unicodedata
-import six
-
-
-def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
-  """Checks whether the casing config is consistent with the checkpoint name."""
-
-  # The casing has to be passed in by the user and there is no explicit check
-  # as to whether it matches the checkpoint. The casing information probably
-  # should have been stored in the bert_config.json file, but it's not, so
-  # we have to heuristically detect it to validate.
-
-  if not init_checkpoint:
-    return
-
-  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
-  if m is None:
-    return
-
-  model_name = m.group(1)
-
-  lower_models = [
-      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
-      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
-  ]
-
-  cased_models = [
-      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
-      "multi_cased_L-12_H-768_A-12"
-  ]
-
-  is_bad_config = False
-  if model_name in lower_models and not do_lower_case:
-    is_bad_config = True
-    actual_flag = "False"
-    case_name = "lowercased"
-    opposite_flag = "True"
-
-  if model_name in cased_models and do_lower_case:
-    is_bad_config = True
-    actual_flag = "True"
-    case_name = "cased"
-    opposite_flag = "False"
-
-  if is_bad_config:
-    raise ValueError(
-        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
-        "However, `%s` seems to be a %s model, so you "
-        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
-        "how the model was pre-training. If this error is wrong, please "
-        "just comment out this check." % (actual_flag, init_checkpoint,
-                                          model_name, case_name, opposite_flag))
-
-
-def convert_to_unicode(text):
-  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-  if six.PY3:
-    if isinstance(text, str):
-      return text
-    elif isinstance(text, bytes):
-      return text.decode("utf-8", "ignore")
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  elif six.PY2:
-    if isinstance(text, str):
-      return text.decode("utf-8", "ignore")
-    elif isinstance(text, unicode):
-      return text
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  else:
-    raise ValueError("Not running on Python2 or Python 3?")
-
-
-def printable_text(text):
-  """Returns text encoded in a way suitable for print or `tf.logging`."""
-
-  # These functions want `str` for both Python2 and Python3, but in one case
-  # it's a Unicode string and in the other it's a byte string.
-  if six.PY3:
-    if isinstance(text, str):
-      return text
-    elif isinstance(text, bytes):
-      return text.decode("utf-8", "ignore")
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  elif six.PY2:
-    if isinstance(text, str):
-      return text
-    elif isinstance(text, unicode):
-      return text.encode("utf-8")
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  else:
-    raise ValueError("Not running on Python2 or Python 3?")
-
-
-def load_vocab(vocab_file):
-  """Loads a vocabulary file into a dictionary."""
-  vocab = collections.OrderedDict()
-  index = 0
-  with open(vocab_file, "r", encoding='utf-8') as reader:
-    while True:
-      token = convert_to_unicode(reader.readline())
-      if not token:
-        break
-      token = token.strip()
-      vocab[token] = index
-      index += 1
-  return vocab
-
-
-def convert_by_vocab(vocab, items):
-  """Converts a sequence of [tokens|ids] using the vocab."""
-  output = []
-  for item in items:
-    output.append(vocab[item])
-  return output
-
-
-def convert_tokens_to_ids(vocab, tokens):
-  return convert_by_vocab(vocab, tokens)
-
-
-def convert_ids_to_tokens(inv_vocab, ids):
-  return convert_by_vocab(inv_vocab, ids)
-
-
-def whitespace_tokenize(text):
-  """Runs basic whitespace cleaning and splitting on a piece of text."""
-  text = text.strip()
-  if not text:
-    return []
-  tokens = text.split()
-  return tokens
-
-
-class FullTokenizer(object):
-  """Runs end-to-end tokenziation."""
-
-  def __init__(self, vocab_file, do_lower_case=True):
-    self.vocab = load_vocab(vocab_file)
-    self.inv_vocab = {v: k for k, v in self.vocab.items()}
-    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-
-  def tokenize(self, text):
-    split_tokens = []
-    for token in self.basic_tokenizer.tokenize(text):
-      for sub_token in self.wordpiece_tokenizer.tokenize(token):
-        split_tokens.append(sub_token)
-
-    return split_tokens
-
-  def convert_tokens_to_ids(self, tokens):
-    return convert_by_vocab(self.vocab, tokens)
-
-  def convert_ids_to_tokens(self, ids):
-    return convert_by_vocab(self.inv_vocab, ids)
-
-
-class BertTokenizer(object):
-    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
-
-    def __init__(self, vocab_file, do_lower_case=True):
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict(
-            [(ids, tok) for tok, ids in self.vocab.items()])
-        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-
-    def tokenize(self, text):
-        split_tokens = []
-        for token in self.basic_tokenizer.tokenize(text):
-            for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                split_tokens.append(sub_token)
-        return split_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        """Converts a sequence of tokens into ids using the vocab."""
-        ids = []
-        for token in tokens:
-            ids.append(self.vocab[token])
-        return ids
-
-    def convert_ids_to_tokens(self, ids):
-        """Converts a sequence of ids in wordpiece tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            tokens.append(self.ids_to_tokens[i])
-        return tokens
-
-class BasicTokenizer(object):
-  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-
-  def __init__(self, do_lower_case=True):
-    """Constructs a BasicTokenizer.
-
-    Args:
-      do_lower_case: Whether to lower case the input.
-    """
-    self.do_lower_case = do_lower_case
-
-  def tokenize(self, text):
-    """Tokenizes a piece of text."""
-    text = convert_to_unicode(text)
-    text = self._clean_text(text)
-
-    # This was added on November 1st, 2018 for the multilingual and Chinese
-    # models. This is also applied to the English models now, but it doesn't
-    # matter since the English models were not trained on any Chinese data
-    # and generally don't have any Chinese data in them (there are Chinese
-    # characters in the vocabulary because Wikipedia does have some Chinese
-    # words in the English Wikipedia.).
-    text = self._tokenize_chinese_chars(text)
-
-    orig_tokens = whitespace_tokenize(text)
-    split_tokens = []
-    for token in orig_tokens:
-      if self.do_lower_case:
-        token = token.lower()
-        token = self._run_strip_accents(token)
-      split_tokens.extend(self._run_split_on_punc(token))
-
-    output_tokens = whitespace_tokenize(" ".join(split_tokens))
-    return output_tokens
-
-  def _run_strip_accents(self, text):
-    """Strips accents from a piece of text."""
-    text = unicodedata.normalize("NFD", text)
-    output = []
-    for char in text:
-      cat = unicodedata.category(char)
-      if cat == "Mn":
-        continue
-      output.append(char)
-    return "".join(output)
-
-  def _run_split_on_punc(self, text):
-    """Splits punctuation on a piece of text."""
-    chars = list(text)
-    i = 0
-    start_new_word = True
-    output = []
-    while i < len(chars):
-      char = chars[i]
-      if _is_punctuation(char):
-        output.append([char])
-        start_new_word = True
-      else:
-        if start_new_word:
-          output.append([])
-        start_new_word = False
-        output[-1].append(char)
-      i += 1
-
-    return ["".join(x) for x in output]
-
-  def _tokenize_chinese_chars(self, text):
-    """Adds whitespace around any CJK character."""
-    output = []
-    for char in text:
-      cp = ord(char)
-      if self._is_chinese_char(cp):
-        output.append(" ")
-        output.append(char)
-        output.append(" ")
-      else:
-        output.append(char)
-    return "".join(output)
-
-  def _is_chinese_char(self, cp):
-    """Checks whether CP is the codepoint of a CJK character."""
-    # This defines a "chinese character" as anything in the CJK Unicode block:
-    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-    #
-    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-    # despite its name. The modern Korean Hangul alphabet is a different block,
-    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-    # space-separated words, so they are not treated specially and handled
-    # like the all of the other languages.
-    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
-        (cp >= 0x3400 and cp <= 0x4DBF) or  #
-        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
-        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
-        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
-        (cp >= 0x2B820 and cp <= 0x2CEAF) or
-        (cp >= 0xF900 and cp <= 0xFAFF) or  #
-        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
-      return True
-
-    return False
-
-  def _clean_text(self, text):
-    """Performs invalid character removal and whitespace cleanup on text."""
-    output = []
-    for char in text:
-      cp = ord(char)
-      if cp == 0 or cp == 0xfffd or _is_control(char):
-        continue
-      if _is_whitespace(char):
-        output.append(" ")
-      else:
-        output.append(char)
-    return "".join(output)
-
-
-class WordpieceTokenizer(object):
-  """Runs WordPiece tokenziation."""
-
-  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
-    self.vocab = vocab
-    self.unk_token = unk_token
-    self.max_input_chars_per_word = max_input_chars_per_word
-
-  def tokenize(self, text):
-    """Tokenizes a piece of text into its word pieces.
-
-    This uses a greedy longest-match-first algorithm to perform tokenization
-    using the given vocabulary.
-
-    For example:
-      input = "unaffable"
-      output = ["un", "##aff", "##able"]
-
-    Args:
-      text: A single token or whitespace separated tokens. This should have
-        already been passed through `BasicTokenizer.
-
-    Returns:
-      A list of wordpiece tokens.
-    """
-
-    text = convert_to_unicode(text)
-
-    output_tokens = []
-    for token in whitespace_tokenize(text):
-      chars = list(token)
-      if len(chars) > self.max_input_chars_per_word:
-        output_tokens.append(self.unk_token)
-        continue
-
-      is_bad = False
-      start = 0
-      sub_tokens = []
-      while start < len(chars):
-        end = len(chars)
-        cur_substr = None
-        while start < end:
-          substr = "".join(chars[start:end])
-          if start > 0:
-            substr = "##" + substr
-          if substr in self.vocab:
-            cur_substr = substr
-            break
-          end -= 1
-        if cur_substr is None:
-          is_bad = True
-          break
-        sub_tokens.append(cur_substr)
-        start = end
-
-      if is_bad:
-        output_tokens.append(self.unk_token)
-      else:
-        output_tokens.extend(sub_tokens)
-    return output_tokens
-
-
-def _is_whitespace(char):
-  """Checks whether `chars` is a whitespace character."""
-  # \t, \n, and \r are technically contorl characters but we treat them
-  # as whitespace since they are generally considered as such.
-  if char == " " or char == "\t" or char == "\n" or char == "\r":
-    return True
-  cat = unicodedata.category(char)
-  if cat == "Zs":
-    return True
-  return False
-
-
-def _is_control(char):
-  """Checks whether `chars` is a control character."""
-  # These are technically control characters but we count them as whitespace
-  # characters.
-  if char == "\t" or char == "\n" or char == "\r":
-    return False
-  cat = unicodedata.category(char)
-  if cat.startswith("C"):
-    return True
-  return False
-
-
-def _is_punctuation(char):
-  """Checks whether `chars` is a punctuation character."""
-  cp = ord(char)
-  # We treat all non-letter/number ASCII as punctuation.
-  # Characters such as "^", "$", and "`" are not in the Unicode
-  # Punctuation class but we treat them as punctuation anyways, for
-  # consistency.
-  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
-    return True
-  cat = unicodedata.category(char)
-  if cat.startswith("P"):
-    return True
-  return False

+ 0 - 357
TensorFlow/LanguageModeling/BERT/trt/inference.ipynb

@@ -1,357 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
-    "#\n",
-    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-    "# you may not use this file except in compliance with the License.\n",
-    "# You may obtain a copy of the License at\n",
-    "#\n",
-    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
-    "#\n",
-    "# Unless required by applicable law or agreed to in writing, software\n",
-    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-    "# See the License for the specific language governing permissions and\n",
-    "# limitations under the License.\n",
-    "# =============================================================================="
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<img src=\"https://upload.wikimedia.org/wikipedia/en/6/6d/Nvidia_image_logo.svg\" style=\"width: 90px; float: right;\">\n",
-    "\n",
-    "# QA Inference on BERT using TensorRT"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1. Overview\n",
-    "\n",
-    "Bidirectional Embedding Representations from Transformers (BERT), is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
-    "\n",
-    "The original paper can be found here: https://arxiv.org/abs/1810.04805.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 1.a Learning objectives\n",
-    "\n",
-    "This notebook demonstrates:\n",
-    "- Inference on Question Answering (QA) task with BERT Base/Large model\n",
-    "- The use fine-tuned NVIDIA BERT models\n",
-    "- Use of BERT model with TRT"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2. Requirements\n",
-    "\n",
-    "Please refer to the ReadMe file"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3. BERT Inference: Question Answering\n",
-    "\n",
-    "We can run inference on a fine-tuned BERT model for tasks like Question Answering.\n",
-    "\n",
-    "Here we use a BERT model fine-tuned on a [SQuaD 2.0 Dataset](https://rajpurkar.github.io/SQuAD-explorer/) which contains 100,000+ question-answer pairs on 500+ articles combined with over 50,000 new, unanswerable questions."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 3.a Paragraph and Queries\n",
-    "\n",
-    "The paragraph and the questions can be customized by changing the text below. Note that when using models with small sequence lengths, you should use a shorter paragraph:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Paragraph:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"\n",
-    "\n",
-    "# Short paragraph version for BERT models with max sequence length of 128\n",
-    "short_paragraph_text = \"The Apollo program was the third United States human spaceflight program. First conceived as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was dedicated to President John F. Kennedy's national goal of landing a man on the Moon. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972 followed by the Apollo-Soyuz Test Project a joint Earth orbit mission with the Soviet Union in 1975.\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Question:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "question_text = \"What project put the first Americans into space?\"\n",
-    "#question_text =  \"What year did the first manned Apollo flight occur?\"\n",
-    "#question_text =  \"What President is credited with the original notion of putting Americans in space?\"\n",
-    "#question_text =  \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this example we ask our BERT model questions related to the following paragraph:\n",
-    "\n",
-    "**The Apollo Program**\n",
-    "_\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"_\n",
-    "\n",
-    "The questions and relative answers expected are shown below:\n",
-    "\n",
-    " - **Q1:** \"What project put the first Americans into space?\" \n",
-    "  - **A1:** \"Project Mercury\"\n",
-    " - **Q2:** \"What program was created to carry out these projects and missions?\"\n",
-    "  - **A2:** \"The Apollo program\"\n",
-    " - **Q3:** \"What year did the first manned Apollo flight occur?\"\n",
-    "  - **A3:** \"1968\"\n",
-    " - **Q4:** \"What President is credited with the original notion of putting Americans in space?\"\n",
-    "  - **A4:** \"John F. Kennedy\"\n",
-    " - **Q5:** \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\"\n",
-    "  - **A5:** \"Soviet Union\"\n",
-    " - **Q6:** \"How long did Project Apollo run?\"\n",
-    "  - **A6:** \"1961 to 1972\"\n",
-    " - **Q7:** \"What program helped develop space travel techniques that Project Apollo used?\"\n",
-    "  - **A7:** \"Gemini Mission\"\n",
-    " - **Q8:** \"What space station supported three manned missions in 1973-1974?\"\n",
-    "  - **A8:** \"Skylab\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data Preprocessing\n",
-    "Let's convert the paragraph and the question to BERT input with the help of the tokenizer:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import helpers.data_processing as dp\n",
-    "import helpers.tokenization as tokenization\n",
-    "\n",
-    "tokenizer = tokenization.FullTokenizer(vocab_file=\"/workspace/bert/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/vocab.txt\", do_lower_case=True)\n",
-    "\n",
-    "# The maximum number of tokens for the question. Questions longer than this will be truncated to this length.\n",
-    "max_query_length = 64\n",
-    "\n",
-    "# When splitting up a long document into chunks, how much stride to take between chunks.\n",
-    "doc_stride = 128\n",
-    "\n",
-    "# The maximum total input sequence length after WordPiece tokenization. \n",
-    "# Sequences longer than this will be truncated, and sequences shorter \n",
-    "max_seq_length = 128\n",
-    "\n",
-    "# Extract tokens from the paragraph\n",
-    "doc_tokens = dp.convert_doc_tokens(short_paragraph_text)\n",
-    "\n",
-    "# Extract features from the paragraph and question\n",
-    "features = dp.convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_length, doc_stride, max_query_length)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## TensorRT Inference"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tensorrt as trt\n",
-    "TRT_LOGGER = trt.Logger(trt.Logger.INFO)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import ctypes\n",
-    "import os\n",
-    "\n",
-    "ctypes.CDLL(\"libnvinfer_plugin.so\", mode=ctypes.RTLD_GLOBAL)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pycuda.driver as cuda\n",
-    "import pycuda.autoinit\n",
-    "import collections\n",
-    "import numpy as np\n",
-    "import time\n",
-    "\n",
-    "# Load the BERT-Large Engine\n",
-    "with open(\"/workspace/bert/engines/bert_large_128.engine\", \"rb\") as f, \\\n",
-    "    trt.Runtime(TRT_LOGGER) as runtime, \\\n",
-    "    runtime.deserialize_cuda_engine(f.read()) as engine, \\\n",
-    "    engine.create_execution_context() as context:\n",
-    "\n",
-    "     # We always use batch size 1.\n",
-    "    input_shape = (max_seq_length, 1)\n",
-    "    input_nbytes = trt.volume(input_shape) * trt.int32.itemsize\n",
-    "    \n",
-    "    # Allocate device memory for inputs.\n",
-    "    d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]\n",
-    "    # Create a stream in which to copy inputs/outputs and run inference.\n",
-    "    stream = cuda.Stream()\n",
-    "\n",
-    "    # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)\n",
-    "    # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.\n",
-    "    for binding in range(3):\n",
-    "        context.set_binding_shape(binding, input_shape)\n",
-    "    assert context.all_binding_shapes_specified\n",
-    "\n",
-    "    # Allocate output buffer by querying the size from the context. This may be different for different input shapes.\n",
-    "    h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)\n",
-    "    d_output = cuda.mem_alloc(h_output.nbytes)\n",
-    "\n",
-    "    print(\"\\nRunning Inference...\")\n",
-    "\n",
-    "    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name\n",
-    "        \"NetworkOutput\",\n",
-    "        [\"start_logits\", \"end_logits\", \"feature_index\"])\n",
-    "    networkOutputs = []\n",
-    "\n",
-    "    eval_time_elapsed = 0\n",
-    "    for feature_index, feature in enumerate(features):\n",
-    "        # Copy inputs\n",
-    "        input_ids = cuda.register_host_memory(np.ascontiguousarray(feature.input_ids.ravel()))\n",
-    "        segment_ids = cuda.register_host_memory(np.ascontiguousarray(feature.segment_ids.ravel()))\n",
-    "        input_mask = cuda.register_host_memory(np.ascontiguousarray(feature.input_mask.ravel()))\n",
-    "\n",
-    "        eval_start_time = time.time()\n",
-    "        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)\n",
-    "        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)\n",
-    "        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)\n",
-    "\n",
-    "        # Run inference\n",
-    "        context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)\n",
-    "        # Synchronize the stream\n",
-    "        stream.synchronize()\n",
-    "        eval_time_elapsed += (time.time() - eval_start_time)\n",
-    "\n",
-    "        # Transfer predictions back from GPU\n",
-    "        cuda.memcpy_dtoh_async(h_output, d_output, stream)\n",
-    "        stream.synchronize()\n",
-    "\n",
-    "        for index, batch in enumerate(h_output):\n",
-    "            # Data Post-processing\n",
-    "            networkOutputs.append(_NetworkOutput(\n",
-    "                start_logits = np.array(batch.squeeze()[:, 0]),\n",
-    "                end_logits = np.array(batch.squeeze()[:, 1]),\n",
-    "                feature_index = feature_index\n",
-    "                ))\n",
-    "\n",
-    "    eval_time_elapsed /= len(features)\n",
-    "    \n",
-    "    print(\"-----------------------------\")\n",
-    "    print(\"Running Inference at {:.3f} Sentences/Sec\".format(1.0/eval_time_elapsed))\n",
-    "    print(\"-----------------------------\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data Post-Processing"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now that we have the inference results let's extract the actual answer to our question"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "    # The total number of n-best predictions to generate in the nbest_predictions.json output file\n",
-    "    n_best_size = 20\n",
-    "\n",
-    "    # The maximum length of an answer that can be generated. This is needed \n",
-    "    #  because the start and end predictions are not conditioned on one another\n",
-    "    max_answer_length = 30\n",
-    "\n",
-    "    prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features,\n",
-    "        networkOutputs, n_best_size, max_answer_length)\n",
-    "    \n",
-    "    for index, output in enumerate(networkOutputs):\n",
-    "        print(\"Processing output\")\n",
-    "        print(\"Answer: '{}'\".format(prediction))\n",
-    "        print(\"with prob: {:.3f}%\".format(nbest_json[0]['probability'] * 100.0))"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

+ 0 - 222
TensorFlow/LanguageModeling/BERT/trt/inference.py

@@ -1,222 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time
-import json
-import ctypes
-import argparse
-import collections
-import numpy as np
-import tensorrt as trt
-import pycuda.driver as cuda
-import pycuda.autoinit
-
-import helpers.tokenization as tokenization
-import helpers.data_processing as dp
-
-TRT_LOGGER = trt.Logger(trt.Logger.INFO)
-
-def parse_args():
-    """
-    Parse command line arguments
-    """
-    parser = argparse.ArgumentParser(description='BERT QA Inference')
-    parser.add_argument('-e', '--engine',
-            help='Path to BERT TensorRT engine')
-    parser.add_argument('-p', '--passage', nargs='*',
-            help='Text for paragraph/passage for BERT QA',
-            default='')
-    parser.add_argument('-pf', '--passage-file',
-            help='File containing input passage',
-            default='')
-    parser.add_argument('-q', '--question', nargs='*',
-            help='Text for query/question for BERT QA',
-            default='')
-    parser.add_argument('-qf', '--question-file',
-            help='File containiner input question',
-            default='')
-    parser.add_argument('-sq', '--squad-json',
-            help='SQuAD json file',
-            default='')
-    parser.add_argument('-o', '--output-prediction-file',
-            help='Output prediction file for SQuAD evaluation',
-            default='./predictions.json')
-    parser.add_argument('-v', '--vocab-file',
-            help='Path to file containing entire understandable vocab')
-    parser.add_argument('-s', '--sequence-length',
-            help='The sequence length to use. Defaults to 128',
-            default=128, type=int)
-    parser.add_argument('--max-query-length',
-            help='The maximum length of a query in number of tokens. Queries longer than this will be truncated',
-            default=64, type=int)
-    parser.add_argument('--max-answer-length',
-            help='The maximum length of an answer that can be generated',
-            default=30, type=int)
-    parser.add_argument('--n-best-size',
-            help='Total number of n-best predictions to generate in the nbest_predictions.json output file',
-            default=20, type=int)
-    args, _ = parser.parse_known_args()
-    return args
-
-if __name__ == '__main__':
-    args = parse_args()
-
-    paragraph_text = None
-    squad_examples = None
-    output_prediction_file = None
-
-    if not args.passage == '':
-        paragraph_text = ' '.join(args.passage)
-    elif not args.passage_file == '':
-        f = open(args.passage_file, 'r')
-        paragraph_text = f.read()
-    elif not args.squad_json == '':
-        squad_examples = dp.read_squad_json(args.squad_json)
-        output_prediction_file = args.output_prediction_file
-    else:
-        paragraph_text = input("Paragraph: ")
-
-    question_text = None
-    if not args.question == '':
-        question_text = ' '.join(args.question)
-    elif not args.question_file == '':
-        f = open(args.question_file, 'r')
-        question_text = f.read()
-
-    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
-    # When splitting up a long document into chunks, how much stride to take between chunks.
-    doc_stride = 128
-    # The maximum total input sequence length after WordPiece tokenization.
-    # Sequences longer than this will be truncated, and sequences shorter
-    max_seq_length = args.sequence_length
-
-    def question_features(tokens, question):
-        # Extract features from the paragraph and question
-        return dp.convert_example_to_features(tokens, question, tokenizer, max_seq_length, doc_stride, args.max_query_length)
-
-    # Import necessary plugins for BERT TensorRT
-    ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
-
-    # The first context created will use the 0th profile. A new context must be created
-    # for each additional profile needed. Here, we only use batch size 1, thus we only need the first profile.
-    with open(args.engine, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime, \
-        runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
-
-        # We always use batch size 1.
-        input_shape = (max_seq_length, 1)
-        input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
-
-        # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)
-        # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.
-        for binding in range(3):
-            context.set_binding_shape(binding, input_shape)
-        assert context.all_binding_shapes_specified
-
-        # Create a stream in which to copy inputs/outputs and run inference.
-        stream = cuda.Stream()
-
-        # Allocate device memory for inputs.
-        d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]
-
-        # Allocate output buffer by querying the size from the context. This may be different for different input shapes.
-        h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
-        d_output = cuda.mem_alloc(h_output.nbytes)
-
-        def inference(features, tokens):
-            global h_output
-
-            _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
-                    "NetworkOutput",
-                    ["start_logits", "end_logits", "feature_index"])
-            networkOutputs = []
-
-            eval_time_elapsed = 0
-            for feature_index, feature in enumerate(features):
-                # Copy inputs
-                input_ids = cuda.register_host_memory(np.ascontiguousarray(feature.input_ids.ravel()))
-                segment_ids = cuda.register_host_memory(np.ascontiguousarray(feature.segment_ids.ravel()))
-                input_mask = cuda.register_host_memory(np.ascontiguousarray(feature.input_mask.ravel()))
-
-                eval_start_time = time.time()
-                cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
-                cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
-                cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)
-
-                # Run inference
-                context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
-                # Synchronize the stream
-                stream.synchronize()
-                eval_time_elapsed += (time.time() - eval_start_time)
-
-                # Transfer predictions back from GPU
-                cuda.memcpy_dtoh_async(h_output, d_output, stream)
-                stream.synchronize()
-
-                for index, batch in enumerate(h_output):
-                    # Data Post-processing
-                    networkOutputs.append(_NetworkOutput(
-                        start_logits = np.array(batch.squeeze()[:, 0]),
-                        end_logits = np.array(batch.squeeze()[:, 1]),
-                        feature_index = feature_index
-                        ))
-
-            eval_time_elapsed /= len(features)
-
-            prediction, nbest_json, scores_diff_json = dp.get_predictions(tokens, features,
-                    networkOutputs, args.n_best_size, args.max_answer_length)
-
-            return eval_time_elapsed, prediction, nbest_json
-
-        def print_single_query(eval_time_elapsed, prediction, nbest_json):
-            print("------------------------")
-            print("Running inference in {:.3f} Sentences/Sec".format(1.0/eval_time_elapsed))
-            print("------------------------")
-
-            print("Answer: '{}'".format(prediction))
-            print("With probability: {:.3f}".format(nbest_json[0]['probability'] * 100.0))
-
-        if squad_examples:
-            all_predictions = collections.OrderedDict()
-
-            for example in squad_examples:
-                features = question_features(example.doc_tokens, example.question_text)
-                eval_time_elapsed, prediction, nbest_json = inference(features, example.doc_tokens)
-                all_predictions[example.id] = prediction
-
-            with open(output_prediction_file, "w") as f:
-                f.write(json.dumps(all_predictions, indent=4))
-                print("\nOutput dump to {}".format(output_prediction_file))
-        else:
-            # Extract tokecs from the paragraph
-            doc_tokens = dp.convert_doc_tokens(paragraph_text)
-
-            if question_text:
-                print("\nPassage: {}".format(paragraph_text))
-                print("\nQuestion: {}".format(question_text))
-
-                features = question_features(doc_tokens, question_text)
-                eval_time_elapsed, prediction, nbest_json = inference(features, doc_tokens)
-                print_single_query(eval_time_elapsed, prediction, nbest_json)
-
-            else:
-                # If no question text is provided, loop until the question is 'exit'
-                EXIT_CMDS = ["exit", "quit"]
-                question_text = input("Question (to exit, type one of {:}): ".format(EXIT_CMDS))
-
-                while question_text.strip() not in EXIT_CMDS:
-                    features = question_features(doc_tokens, question_text)
-                    eval_time_elapsed, prediction, nbest_json = inference(features, doc_tokens)
-                    print_single_query(eval_time_elapsed, prediction, nbest_json)
-                    question_text = input("Question (to exit, type one of {:}): ".format(EXIT_CMDS))

+ 0 - 117
TensorFlow/LanguageModeling/BERT/trt/perf.py

@@ -1,117 +0,0 @@
-import argparse
-import ctypes
-import numpy as np
-import tensorrt as trt
-import pycuda.driver as cuda
-import pycuda.autoinit
-
-import numpy as np
-
-TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
-
-class DeviceBuffer(object):
-    def __init__(self, shape, dtype=trt.int32):
-        self.buf = cuda.mem_alloc(trt.volume(shape) * dtype.itemsize)
-
-    def binding(self):
-        return int(self.buf)
-
-    def free(self):
-        self.buf.free()
-
-
-def main():
-    parser = argparse.ArgumentParser(description='BERT Inference Benchmark')
-    parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine')
-    parser.add_argument('-b', '--batch-size', default=[], action="append", help='Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.', type=int)
-    parser.add_argument('-s', '--sequence-length', default=128, help='Sequence length of the BERT model', type=int)
-    parser.add_argument('-i', '--iterations', default=200, help='Number of iterations to run when benchmarking each batch size.', type=int)
-    parser.add_argument('-w', '--warm-up-runs', default=10, help='Number of iterations to run prior to benchmarking.', type=int)
-    parser.add_argument('-r', '--random-seed', required=False, default=12345, help='Random seed.', type=int)
-    args, _ = parser.parse_known_args()
-    args.batch_size = args.batch_size or [1]
-
-    # Import necessary plugins for BERT TensorRT
-    ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
-
-    with open(args.engine, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
-        # Allocate buffers large enough to store the largest batch size
-        max_input_shape = (args.sequence_length, max(args.batch_size))
-        max_output_shape = (args.sequence_length, max(args.batch_size), 2, 1, 1)
-        buffers = [
-            DeviceBuffer(max_input_shape),
-            DeviceBuffer(max_input_shape),
-            DeviceBuffer(max_input_shape),
-            DeviceBuffer(max_output_shape)
-        ]
-
-        # Prepare random input
-        pseudo_vocab_size = 30522
-        pseudo_type_vocab_size = 2
-        np.random.seed(args.random_seed)
-        test_word_ids = np.random.randint(0, pseudo_vocab_size, (args.sequence_length, max(args.batch_size)), dtype=np.int32)
-        test_segment_ids = np.random.randint(0, pseudo_type_vocab_size, (args.sequence_length, max(args.batch_size)), dtype=np.int32)
-        test_input_mask = np.ones((args.sequence_length, max(args.batch_size)), dtype=np.int32)
-
-        # Copy input h2d
-        cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel())
-        cuda.memcpy_htod(buffers[1].buf, test_segment_ids.ravel())
-        cuda.memcpy_htod(buffers[2].buf, test_input_mask.ravel())
-
-        num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles
-
-        bench_times = {}
-
-        for idx, batch_size in enumerate(sorted(args.batch_size)):
-            context.active_optimization_profile = idx
-
-            # Each profile has unique bindings
-            binding_idx_offset = idx * num_binding_per_profile
-            bindings = [0] * binding_idx_offset + [buf.binding() for buf in buffers]
-
-            shapes = {
-                "input_ids": (args.sequence_length, batch_size),
-                "segment_ids": (args.sequence_length, batch_size),
-                "input_mask": (args.sequence_length, batch_size),
-            }
-
-            for binding, shape in shapes.items():
-                context.set_binding_shape(engine[binding] + binding_idx_offset, shape)
-            assert context.all_binding_shapes_specified
-
-            # Inference
-            total_time = 0
-            start = cuda.Event()
-            end = cuda.Event()
-            stream = cuda.Stream()
-
-            # Warmup
-            for _ in range(args.warm_up_runs):
-                context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
-                stream.synchronize()
-
-            # Timing loop
-            times = []
-            for _ in range(args.iterations):
-                start.record(stream)
-                context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
-                end.record(stream)
-                stream.synchronize()
-                times.append(end.time_since(start))
-
-            # Compute average time, 95th percentile time and 99th percentile time.
-            bench_times[batch_size] = times
-
-        [b.free() for b in buffers]
-
-        for batch_size, times in bench_times.items():
-            total_time = sum(times)
-            avg_time = total_time / float(len(times))
-            times.sort()
-            percentile95 = times[int(len(times) * 0.95)]
-            percentile99 = times[int(len(times) * 0.99)]
-            print("Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:} ms \tAverage Time: {:} ms\t95th Percentile Time: {:} ms\t99th Percentile Time: {:}".format(args.iterations, batch_size, total_time, avg_time, percentile95, percentile99))
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 4
TensorFlow/LanguageModeling/BERT/trt/requirements.txt

@@ -1,4 +0,0 @@
-tensorflow==1.13.1
-jupyter
-numpy
-pycuda

+ 0 - 22
TensorFlow/LanguageModeling/BERT/trt/scripts/build.sh

@@ -1,22 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-SCRIPT=$(readlink -f "$0")
-SCRIPT_DIR=$(dirname ${SCRIPT})
-
-# Build the docker image using the provided Docker file
-DOCKERFILE_DIR=${SCRIPT_DIR}/../
-docker build --build-arg uid=$(id -u) --build-arg gid=$(id -g) --rm -t bert-tensorrt $DOCKERFILE_DIR

+ 0 - 61
TensorFlow/LanguageModeling/BERT/trt/scripts/download_model.sh

@@ -1,61 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Setup default parameters (if no command-line parameters given)
-VERSION='v2'
-MODEL='large'
-FT_PRECISION='fp16'
-SEQ_LEN='128'
-
-SCRIPT=$(readlink -f "$0")
-SCRIPT_DIR=$(dirname ${SCRIPT})
-TENSORRT_DIR=${SCRIPT_DIR}/../../../
-
-while test $# -gt 0
-do
-    case "$1" in
-        -h) echo "Usage: sh download_model.sh [base|large] [fp16|fp32] [128|384] [v2|v1_1]"
-            exit 0
-            ;;
-        base) MODEL='base'
-            ;;
-        large) MODEL='large'
-            ;;
-        fp16) FT_PRECISION='fp16'
-            ;;
-        fp32) FT_PRECISION='fp32'
-            ;;
-        128) SEQ_LEN='128'
-            ;;
-        384) SEQ_LEN='384'
-            ;;
-        v2) VERSION='v2'
-            ;;
-        v1_1) VERSION='v1_1'
-            ;;
-        *) echo "Invalid argument $1...exiting"
-            exit 0
-            ;;
-    esac
-    shift
-done
-
-# Download the BERT fine-tuned model
-echo "Downloading BERT-${MODEL} with fine-tuned precision ${FT_PRECISION} and sequence length ${SEQ_LEN} from NGC"
-mkdir -p /workspace/bert/models/fine-tuned
-cd /workspace/bert/models/fine-tuned
-ngc registry model download-version nvidia/bert_tf_${VERSION}_${MODEL}_${FT_PRECISION}_${SEQ_LEN}:2

+ 0 - 66
TensorFlow/LanguageModeling/BERT/trt/scripts/inference_benchmark.sh

@@ -1,66 +0,0 @@
-# Usage: run_benchmark(batch_sizes, model_variant: (base/large), precision: (fp16/fp32), sequence_length, max_batch_size)
-run_benchmark() {
-BATCH_SIZES="${1}"
-
-MODEL_VARIANT="${2}"
-PRECISION="${3}"
-SEQUENCE_LENGTH="${4}"
-MAX_BATCH="${5}"
-
-CHECKPOINTS_DIR="/workspace/bert/models/fine-tuned/bert_tf_v2_${MODEL_VARIANT}_${PRECISION}_${SEQUENCE_LENGTH}_v2"
-ENGINE_NAME="/workspace/bert/engines/bert_${MODEL_VARIANT}_${PRECISION}_bs${MAX_BATCH}_seqlen${SEQUENCE_LENGTH}_benchmark.engine"
-
-echo "==== Benchmarking BERT ${MODEL_VARIANT} ${PRECISION} SEQLEN ${SEQUENCE_LENGTH} ===="
-if [ ! -f ${ENGINE_NAME} ]; then
-    if [ ! -d ${CHECKPOINTS_DIR} ]; then
-        echo "Downloading checkpoints: scripts/download_model.sh ${MODEL_VARIANT} ${PRECISION} ${SEQUENCE_LENGTH}"
-        scripts/download_model.sh "${MODEL_VARIANT}" "${PRECISION}" "${SEQUENCE_LENGTH}"
-    fi;
-
-    echo "Building engine: python3 builder.py -m ${CHECKPOINTS_DIR}/model.ckpt-8144 -o ${ENGINE_NAME} ${BATCH_SIZES} -s ${SEQUENCE_LENGTH} --${PRECISION} -c ${CHECKPOINTS_DIR}"
-    python3 builder.py -m ${CHECKPOINTS_DIR}/model.ckpt-8144 -o ${ENGINE_NAME} ${BATCH_SIZES} -s ${SEQUENCE_LENGTH} --${PRECISION} -c ${CHECKPOINTS_DIR}
-fi;
-
-python3 perf.py ${BATCH_SIZES} -s ${SEQUENCE_LENGTH} -e ${ENGINE_NAME}
-echo
-}
-
-mkdir -p /workspace/bert/engines
-
-# BERT BASE
-## FP16
-run_benchmark "-b 1 -b 2 -b 4 -b 8 -b 12 -b 16 -b 24 -b 32" "base" "fp16" "128" "32"
-run_benchmark "-b 64" "base" "fp16" "128" "64"
-run_benchmark "-b 128" "base" "fp16" "128" "128"
-
-run_benchmark "-b 1 -b 2 -b 4 -b 8 -b 12 -b 16 -b 24 -b 32" "base" "fp16" "384" "32"
-run_benchmark "-b 64" "base" "fp16" "384" "64"
-run_benchmark "-b 128" "base" "fp16" "384" "128"
-
-## FP32
-run_benchmark "-b 1 -b 2 -b 4 -b 8 -b 12 -b 16 -b 24 -b 32" "base" "fp32" "128" "32"
-run_benchmark "-b 64" "base" "fp32" "128" "64"
-run_benchmark "-b 128" "base" "fp32" "128" "128"
-
-run_benchmark "-b 1 -b 2 -b 4 -b 8 -b 12 -b 16 -b 24 -b 32" "base" "fp32" "384" "32"
-run_benchmark "-b 64" "base" "fp32" "384" "64"
-run_benchmark "-b 128" "base" "fp32" "384" "128"
-
-# BERT LARGE
-## FP16
-run_benchmark "-b 1 -b 2 -b 4 -b 8 -b 12 -b 16 -b 24 -b 32" "large" "fp16" "128" "32"
-run_benchmark "-b 64" "large" "fp16" "128" "64"
-run_benchmark "-b 128" "large" "fp16" "128" "128"
-
-run_benchmark "-b 1 -b 2 -b 4 -b 8 -b 12 -b 16 -b 24 -b 32" "large" "fp16" "384" "32"
-run_benchmark "-b 64" "large" "fp16" "384" "64"
-run_benchmark "-b 128" "large" "fp16" "384" "128"
-
-## FP32
-run_benchmark "-b 1 -b 2 -b 4 -b 8 -b 12 -b 16 -b 24 -b 32" "large" "fp32" "128" "32"
-run_benchmark "-b 64" "large" "fp32" "128" "64"
-run_benchmark "-b 128" "large" "fp32" "128" "128"
-
-run_benchmark "-b 1 -b 2 -b 4 -b 8 -b 12 -b 16 -b 24 -b 32" "large" "fp32" "384" "32"
-run_benchmark "-b 64" "large" "fp32" "384" "64"
-run_benchmark "-b 128" "large" "fp32" "384" "128"

+ 0 - 31
TensorFlow/LanguageModeling/BERT/trt/scripts/launch.sh

@@ -1,31 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-SCRIPT=$(readlink -f "$0")
-SCRIPT_DIR=$(dirname ${SCRIPT})
-
-DOCKERFILE_DIR=${SCRIPT_DIR}/../
-# Launch the docker container based on the image we just created
-docker run -it --rm \
-    --name bert-tensorrt \
-    --runtime=nvidia \
-    --shm-size=1g \
-    --ulimit memlock=1 \
-    --ulimit stack=67108864 \
-    --publish 0.0.0.0:8888:8888 \
-    -u $(id -u):$(id -g) \
-    -v ${DOCKERFILE_DIR}:/workspace/bert \
-    bert-tensorrt

+ 14 - 0
TensorFlow/LanguageModeling/BERT/utils/utils.py

@@ -13,6 +13,20 @@
 
 import tensorflow as tf
 import time
+import os
+
+def setup_xla_flags():
+  # causes memory fragmentation for bert leading to OOM
+  if os.environ.get("TF_XLA_FLAGS", None) is not None:
+    try:
+      os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation=false"
+    except: #mpi 4.0.2 causes syntax error for =
+      os.environ["TF_XLA_FLAGS"] += " --tf_xla_enable_lazy_compilation false"
+  else:
+    try:
+      os.environ["TF_XLA_FLAGS"] = " --tf_xla_enable_lazy_compilation=false"
+    except:
+      os.environ["TF_XLA_FLAGS"] = " --tf_xla_enable_lazy_compilation false"
 
 # report latency and throughput during eval
 class LogEvalRunHook(tf.estimator.SessionRunHook):