
Adding Transformer-XL/TF

Przemek Strzelczyk · 6 years ago
commit 4f0f43b9a5

+ 2 - 1
README.md

@@ -28,7 +28,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc.
 - __GNMT__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/GNMT)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Translation/GNMT)]
 - __Transformer__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/Transformer)]
 - __BERT__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT)]
-- __Transformer-XL__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL)]
+- __Transformer-XL__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/Transformer-XL)]
 
 
 ### Recommender Systems
@@ -79,6 +79,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc.
 | [SSD320 v1.2](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Detection/SSD) | TensorFlow  | N/A  | Yes  | Yes  | -  | -  | -  | -  | -  |
 | [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT) |TensorFlow  | N/A  | Yes  | Yes  | Yes  | Yes  | -  | [Yes](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/trtis)  | Yes  |
 | [BioBert](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/biobert) | TensorFlow  | N/A  | Yes  | Yes  | -  | -  | -  | -  | -  |
+| [Transformer-XL](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/Transformer-XL) |TensorFlow  | N/A  | Yes  | Yes  | -  | -  |   -  | -  | -  |
 | [Neural Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/NCF) |TensorFlow  | N/A  | Yes  | Yes  | -  | -  | -  | -  | -  |
 | [Variational Autoencoder Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/VAE-CF) |TensorFlow  | N/A  | Yes  | Yes  | -  | -  |   -  | -  | -  |
 | [WideAndDeep](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/WideAndDeep) | TensorFlow  | N/A  | Yes  | Yes  | -  | -  |   -  | -  | -  |

+ 7 - 0
TensorFlow/LanguageModeling/Transformer-XL/Dockerfile

@@ -0,0 +1,7 @@
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.12-tf1-py3
+FROM ${FROM_IMAGE_NAME}
+
+WORKDIR /workspace/transformer-xl/tf
+RUN pip install --no-cache-dir 'git+https://github.com/NVIDIA/dllogger'
+
+ADD tf/ /workspace/transformer-xl/tf

+ 201 - 0
TensorFlow/LanguageModeling/Transformer-XL/LICENSE

@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 9 - 0
TensorFlow/LanguageModeling/Transformer-XL/NOTICE

@@ -0,0 +1,9 @@
+Transformer-XL for TensorFlow
+
+This repository includes software from https://github.com/kimiyoung/transformer-xl licensed under the Apache License 2.0.
+
+This repository includes software from https://github.com/salesforce/awd-lstm-lm licensed under the BSD-3-Clause license.
+
+This repository includes software from https://github.com/cybertronai/transformer-xl licensed under the Apache License 2.0.
+
+This repository includes software from https://github.com/cybertronai/pytorch-lamb licensed under the MIT license.

+ 945 - 0
TensorFlow/LanguageModeling/Transformer-XL/README.md

@@ -0,0 +1,945 @@
+# Transformer-XL For TensorFlow
+
+This repository provides a script and recipe to train the Transformer-XL model
+to achieve state-of-the-art accuracy and is tested and maintained by NVIDIA.
+
+## Table Of Contents
+
+<!-- TOC GFM -->
+
+* [Model overview](#model-overview)
+  * [Model architecture](#model-architecture)
+  * [Default configuration](#default-configuration)
+  * [Feature support matrix](#feature-support-matrix)
+    * [Features](#features)
+  * [Mixed precision training](#mixed-precision-training)
+    * [Enabling mixed precision](#enabling-mixed-precision)
+* [Setup](#setup)
+  * [Requirements](#requirements)
+* [Quick Start Guide](#quick-start-guide)
+* [Advanced](#advanced)
+  * [Scripts and sample code](#scripts-and-sample-code)
+  * [Parameters](#parameters)
+  * [Command-line options](#command-line-options)
+  * [Getting the data](#getting-the-data)
+    * [Dataset guidelines](#dataset-guidelines)
+    * [Multi-dataset](#multi-dataset)
+  * [Training process](#training-process)
+  * [Inference process](#inference-process)
+* [Performance](#performance)
+  * [Benchmarking](#benchmarking)
+    * [Training performance benchmark](#training-performance-benchmark)
+    * [Inference performance benchmark](#inference-performance-benchmark)
+  * [Results](#results)
+    * [Training accuracy results](#training-accuracy-results)
+      * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
+        * [Base model](#base-model)
+      * [Training accuracy: NVIDIA DGX-2 (16x V100 32G)](#training-accuracy-nvidia-dgx-2-16x-v100-32g)
+        * [Base model](#base-model-1)
+      * [Training loss plot](#training-loss-plot)
+        * [Base model](#base-model-2)
+      * [Training stability test](#training-stability-test)
+        * [Base model](#base-model-3)
+    * [Training performance results](#training-performance-results)
+      * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
+        * [Base model](#base-model-4)
+      * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
+        * [Base model](#base-model-5)
+    * [Inference performance results](#inference-performance-results)
+      * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
+        * [Base model](#base-model-6)
+      * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
+        * [Base model](#base-model-7)
+* [Release notes](#release-notes)
+  * [Changelog](#changelog)
+  * [Known issues](#known-issues)
+
+<!-- /TOC -->
+
+## Model overview
+
+This repository provides an implementation of the Transformer-XL model in
+[TensorFlow](https://www.tensorflow.org) from the paper [Transformer-XL: Attentive
+Language Models Beyond a Fixed-Length
+Context](https://arxiv.org/abs/1901.02860). Transformer-XL is a
+transformer-based language model with a segment-level recurrence and a novel
+relative positional encoding. Enhancements introduced in Transformer-XL help
+capture better long-term dependencies by attending to tokens from multiple
+previous segments.
+
+Our implementation is based on the
+[codebase](https://github.com/kimiyoung/transformer-xl) published by the
+authors of the Transformer-XL paper.
+Our implementation uses a modified model architecture. Our
+modifications were made to achieve better hardware utilization and to take
+advantage of Tensor Cores. Similar modifications were also proposed in an
+implementation available from
+[github.com/cybertronai/transformer-xl](https://github.com/cybertronai/transformer-xl).
+Refer to the [Model architecture](#model-architecture) section for more
+details.
+
+This model is trained with mixed precision using Tensor Cores on NVIDIA Volta
+GPUs and evaluated on Volta and Turing GPUs. Therefore, researchers can get
+results up to 1.5x faster than training without Tensor Cores, while
+experiencing the benefits of mixed precision training. This model is tested
+against each NGC monthly container release to ensure consistent accuracy and
+performance over time.
+
+### Model architecture
+
+The Transformer-XL "base" model for WikiText-103 dataset available in this
+repository was modified to use the following hyperparameter values:
+
+
+|**Hyperparameter**|**Description**|**Original setting for the base model**|**Our modification to the base model**|
+|------------------|---------------|--------------------------------------:|--------------------------------------:|
+| `d_model` | hidden size                                                      | 410  | 512  |
+| `n_head`  | number of attention heads                                        | 10   | 8    |
+| `d_head`  | size of each attention head                                      | 41   | 64   |
+| `d_inner` | hidden size in fully-connected layers                            | 2100 | 2048 |
+| `tgt_len` | number of tokens to predict during training                      | 150  | 192  |
+| `mem_len` | number of tokens cached from previous iterations during training | 150  | 192  |
+
+The changes described above were made to align certain hyperparameters with
+powers of two. With this modification, the model achieves better hardware
+utilization and, therefore, higher training throughput.
+
+The following table lists the hyperparameters for the base
+Transformer-XL model for WikiText-103 dataset available in this repository.
+
+| **Hyperparameter** | **Description**                                                  | **Base model** |
+| ------------------ | ---------------------------------------------------------------- | -------------: |
+| `n_layer`          | number of layers                                                 | 16             |
+| `d_model`          | hidden size                                                      | 512            |
+| `n_head`           | number of attention heads                                        | 8              |
+| `d_head`           | size of each attention head                                      | 64             |
+| `d_inner`          | inner hidden size in fully-connected layers                      | 2048           |
+| `dropout`          | dropout                                                          | 0.1            |
+| `dropatt`          | dropout after softmax in the attention                           | 0.0            |
+| `lr`               | base learning rate                                               | 0.01           |
+| `min_lr_ratio`     | minimum learning rate ratio (for cosine decay)                   | 0.1            |
+| `max_step`         | number of training steps                                         | 40,000         |
+| `warmup_step`      | number of learning rate warmup steps                             | 1,000          |
+| `batch_size`       | training batch size                                              | 256            |
+| `tgt_len`          | number of tokens to predict during training                      | 192            |
+| `mem_len`          | number of tokens cached from previous iterations during training | 192            |
+
+
+The Transformer-XL model addresses the limitations of vanilla transformer-based
+language models, which are only able to use relatively short context, bounded
+by the segment length. The Transformer-XL introduces a recurrence mechanism,
+which is able to use a cached hidden state from previous segments. During
+training, the context consists of a concatenation of the current segment's hidden
+state and cached states from previous iterations. Gradients are backpropagated
+only through the current segment, although the model is able to take advantage
+of the extra information stored in the cache and therefore is able to model
+long-term dependencies.
+
+An illustration of the recurrence mechanism taken from the [Transformer-XL
+paper](https://arxiv.org/abs/1901.02860) is shown below.
+![model](tf/img/model.png)
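+
+To make the recurrence concrete, the following is a minimal sketch (not the
+repository's `model.py`; the function and tensor names are illustrative) of how
+the cached memory could be updated and consumed during training:
+
+```python
+import tensorflow as tf
+
+def update_memory(prev_mem, curr_hidden, mem_len):
+    # Append the current segment's hidden states to the cache and keep only the
+    # most recent `mem_len` steps; tf.stop_gradient blocks backpropagation into
+    # the cached states, as described above.
+    new_mem = tf.concat([prev_mem, curr_hidden], axis=0)[-mem_len:]
+    return tf.stop_gradient(new_mem)
+
+def attention_context(mem, curr_hidden):
+    # Keys and values for the current segment attend over
+    # [cached memory; current segment].
+    return tf.concat([mem, curr_hidden], axis=0)
+```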
+
+
+### Default configuration
+
+The following features were implemented in this model:
+
+* general
+  * single-node, Horovod multi-GPU training
+  * training and inference with mixed precision using Tensor Cores
+  * automatic mixed precision training (AMP)
+
+* model
+  * 16-layer base Transformer-XL model with hidden size 512, 8 attention heads,
+    each head with hidden size 64
+  * the model trained on
+    [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)
+    dataset, using word-level vocabulary and
+    adaptive softmax
+  * embedding weights are tied with weights in the classifier
+
+* training
+  * training with [LAMB](https://arxiv.org/abs/1904.00962) optimizer, the
+    implementation of the optimizer uses [XLA](https://www.tensorflow.org/xla), which enables
+    the fusion of elementwise operations and accelerates the training
+  * support for training with a gradient accumulation
+  * base model:
+    * linear learning rate warmup for 1,000 iterations, followed by a cosine
+      learning rate schedule; the initial learning rate is set to 0.0 and the final
+      learning rate to 0.001 (min_lr_ratio * base_lr); see the schedule sketch after this list
+    * training for 40,000 steps, using a batch size of 256
+
+* inference
+  * support for single-GPU inference
+  * each token uses the same amount of context from previous time steps.
+  * base model:
+    * target length is set to 64, length of memory is set to 640
+    * positional embeddings are clamped after 400 time steps
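+
+The warmup-plus-cosine schedule used for the base model can be written compactly.
+A small illustrative sketch follows (an assumed formula; the function and variable
+names are ours, not repository flags):
+
+```python
+import math
+
+def learning_rate(step, base_lr=0.01, min_lr_ratio=0.1,
+                  warmup_steps=1000, max_steps=40000):
+    # Linear warmup from 0 to base_lr, then cosine decay to min_lr_ratio * base_lr.
+    if step < warmup_steps:
+        return base_lr * step / warmup_steps
+    progress = (step - warmup_steps) / (max_steps - warmup_steps)
+    min_lr = min_lr_ratio * base_lr
+    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
+```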
+
+### Feature support matrix
+
+The following features are supported by this model:
+
+| **Feature** | **Transformer-XL** |
+|:------------|-------------------:|
+|[Automatic mixed precision (AMP)](https://nvidia.github.io/apex/amp.html) | Yes |
+|[Horovod Multi-GPU (NCCL)](https://github.com/horovod/horovod) | Yes |
+|[LAMB](https://arxiv.org/abs/1904.00962v3) | Yes |
+
+
+#### Features
+
+[TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) - a 
+tool that enables Tensor Core-accelerated training. Refer to the [Enabling
+mixed precision](#enabling-mixed-precision) section for more details.
+
+[Horovod](https://github.com/horovod/horovod) - Horovod 
+is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet.
+The goal of Horovod is to make distributed deep learning fast and easy to use.
+For more information about how to get started with Horovod, see the [Horovod:
+Official repository](https://github.com/horovod/horovod).
+
+[Multi-GPU training with Horovod](https://github.com/horovod/horovod/#usage) - our model 
+uses Horovod to implement efficient multi-GPU training with NCCL. For details,
+see example sources in this repository or see the [TensorFlow
+tutorial](https://github.com/horovod/horovod/#usage).
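+
+A typical Horovod setup in TF1-style code looks roughly like the sketch below
+(a simplified illustration with a generic optimizer, not the exact code in
+`main.py`):
+
+```python
+import tensorflow as tf
+import horovod.tensorflow as hvd
+
+hvd.init()  # one process per GPU
+
+# Pin each process to a single GPU.
+config = tf.ConfigProto()
+config.gpu_options.visible_device_list = str(hvd.local_rank())
+
+# Wrap any optimizer so gradients are averaged across workers with NCCL allreduce.
+opt = tf.train.GradientDescentOptimizer(0.01)
+opt = hvd.DistributedOptimizer(opt)
+
+# Broadcast initial variables from rank 0 so all workers start from the same state.
+hooks = [hvd.BroadcastGlobalVariablesHook(0)]
+```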
+
+[LAMB](https://arxiv.org/abs/1904.00962v3) - stands
+for Layerwise Adaptive Moments Based optimizer; it is a large batch optimization
+technique that helps accelerate training of deep neural networks using large
+minibatches.
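+
+Schematically, LAMB rescales an Adam-style update by a layer-wise trust ratio.
+A simplified NumPy sketch of the update rule (omitting bias correction and
+clipping; this conveys the paper's idea and is not the repository's `lamb.py`):
+
+```python
+import numpy as np
+
+def lamb_update(w, g, m, v, lr, beta1=0.9, beta2=0.999, eps=1e-6, wd=0.0):
+    m = beta1 * m + (1 - beta1) * g              # first moment estimate
+    v = beta2 * v + (1 - beta2) * g * g          # second moment estimate
+    update = m / (np.sqrt(v) + eps) + wd * w     # Adam-style direction + weight decay
+    trust = np.linalg.norm(w) / (np.linalg.norm(update) + eps)  # layer-wise trust ratio
+    return w - lr * trust * update, m, v
+```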
+
+### Mixed precision training
+
+Mixed precision is the combined use of different numerical precisions in a
+computational method.
+[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant
+computational speedup by performing operations in half-precision format while
+storing minimal information in single-precision to retain as much information
+as possible in critical parts of the network. Since the introduction of [Tensor
+Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing
+architectures, significant training speedups are experienced by switching to
+mixed precision -- up to 3x overall speedup on the most arithmetically intense
+model architectures. Using mixed precision training previously required two
+steps:
+
+1. Porting the model to use the FP16 data type where appropriate.
+2. Manually adding loss scaling to preserve small gradient values.
+
+The ability to train deep learning networks with lower precision was introduced
+in the Pascal architecture and first supported in [CUDA
+8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep
+Learning SDK.
+
+For information about:
+
+* How to train using mixed precision, see the [Mixed Precision
+  Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed
+  Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
+  documentation.
+* Techniques used for mixed precision training, see the [Mixed-Precision
+  Training of Deep Neural
+  Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
+  blog.
+* How to access and enable AMP for TensorFlow, see [Using
+  TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp)
+  from the TensorFlow User Guide. 
+
+#### Enabling mixed precision
+
+Automatic Mixed Precision (AMP) for TensorFlow enables the full [mixed precision
+methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing
+TensorFlow model code.  AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow
+framework code makes all necessary model changes internally.
+
+In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximizes the use of FP16, and the
+loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing
+`tf.contrib` loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the
+automatic mixed precision optimization. It accomplishes this by automatically rewriting all computation graphs with the
+necessary operations to enable mixed precision training and automatic loss scaling.
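+
+In the TF1 NGC container, AMP is usually switched on either with an environment
+variable or with the graph-rewrite API; a minimal sketch is shown below (the
+scripts in this repository expose this through the `--fp16` flag instead):
+
+```python
+import os
+import tensorflow as tf
+
+# Option 1: environment variable honored by the NVIDIA TensorFlow container.
+os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
+
+# Option 2: explicit graph rewrite around an existing optimizer (TF 1.14+),
+# which also enables automatic loss scaling.
+opt = tf.train.AdamOptimizer(1e-3)
+opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)
+```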
+
+## Setup
+
+The following section lists the requirements that you need to meet in order to
+start training the Transformer-XL model.
+
+### Requirements
+
+This repository contains `Dockerfile` which extends the TensorFlow NGC container
+and encapsulates some dependencies. Aside from these dependencies, ensure you
+have the following components:
+
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+* [TensorFlow 19.12-tf1-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) NGC container
+* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+  or [Turing](https://www.nvidia.com/pl-pl/geforce/turing/) based GPU
+
+For more information about how to get started with NGC containers, see the
+following sections from the NVIDIA GPU Cloud Documentation and the Deep
+Learning DGX Documentation:
+
+* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html),
+* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry),
+* [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running)
+
+For those unable to use the TensorFlow NGC container, to set up the required environment or create your own container,
+see the versioned [NVIDIA Container Support
+Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
+
+## Quick Start Guide
+
+To train your model using mixed precision with Tensor Cores or using FP32,
+perform the following steps using the default parameters of the Transformer-XL
+base model on the
+[WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)
+dataset. 
+
+For the specifics concerning training
+and inference, see the [Advanced](#advanced) section.
+
+1. Clone the repository.
+
+```
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/TensorFlow/LanguageModeling/Transformer-XL
+```
+
+2. Download and preprocess the dataset.
+
+```
+bash getdata.sh
+```
+
+3. Build the Transformer-XL TensorFlow NGC container.
+
+```
+bash tf/scripts/docker/build.sh
+```
+
+4. Start an interactive session in the NGC container to run training/inference.
+
+```
+bash tf/scripts/docker/interactive.sh
+```
+
+5. Create tfrecords before your first training/evaluation for a given batch size per GPU.
+Use the same `--batch_chunk` and `--train_batch_size` flags as in the training.
+
+For training on DGX-1 with gradient accumulation in 2 steps:
+```
+bash run_wt103_base.sh train_data --batch_chunk 2
+```
+
+For single GPU training with gradient accumulation in 16 steps:
+```
+bash run_wt103_base.sh train_data --batch_chunk 16
+```
+
+For evaluation:
+```
+bash run_wt103_base.sh test_data
+```
+
+6. Start training.
+
+To start mixed precision training on 8 GPUs on DGX-1, run:
+
+```
+bash run_wt103_base.sh train 8 --fp16 --batch_chunk 2
+```
+
+To start FP32 training on single GPU, run:
+
+```
+bash run_wt103_base.sh train 1 --batch_chunk 16
+```
+
+To start mixed precision training on 16 GPUs on DGX-2, run:
+
+```
+bash run_wt103_base.sh train 16 --fp16
+```
+
+To start FP32 training on 16 GPUs on DGX-2, run:
+
+```
+bash run_wt103_base.sh train 16
+```
+
+For more information on the available options, and for an explanation of what
+happens at the end of training, refer to the [Training
+process](#training-process) section.
+
+7. Start evaluation.
+
+To start mixed precision inference on the test set, run:
+
+```
+bash run_wt103_base.sh eval [--fp16]
+```
+
+The `--fp16` flag is optional, however, if it's set, then the script
+launches mixed precision inference with Tensor Cores. If the flag is not
+present, then the script launches FP32 inference.
+By default, the script loads the checkpoint from
+`LM-TFM/model.ckpt`, which corresponds to the last checkpoint saved
+during the previous training run. The path to the
+checkpoint can be customized by setting the `--model_dir` flag.
+
+For more information on the available options, refer to the [Inference
+process](#inference-process) section.
+
+## Advanced
+
+The following sections provide greater details of the dataset, running training
+and inference, and the training results.
+
+### Scripts and sample code
+
+* `Dockerfile`: a container with the basic set of dependencies to run
+  Transformer-XL
+
+In the `tf` directory, the most important files are:
+
+* `data_utils.py`: data loading utilities
+* `exp_utils.py`: utility functions for running training and benchmarking
+* `lamb.py`: implementation of [LAMB](https://arxiv.org/abs/1904.00962)
+  optimizer
+* `main.py`: serves as the entry point to launch the training and inference
+* `model.py`: implementation of the Transformer-XL model
+* `vocabulary.py`: implementation of word-level vocabulary
+
+### Parameters
+
+The complete list of available parameters for the `tf/main.py` script contains:
+
+```
+  --batch_chunk: Number of accumulation steps.
+    (default: '1')
+    (an integer)
+  --clamp_len: Clamp length
+    (default: '-1')
+    (an integer)
+  --clip: Gradient clipping value.
+    (default: '0.25')
+    (a number)
+  --corpus_info_path: Path to corpus-info.json file.
+    (default: '')
+  --d_embed: Dimension of the embeddings.
+    (default: '512')
+    (an integer)
+  --d_head: Dimension of each attention head.
+    (default: '64')
+    (an integer)
+  --d_inner: Dimension of inner hidden size in positionwise feed-forward.
+    (default: '2048')
+    (an integer)
+  --d_model: Dimension of the model.
+    (default: '512')
+    (an integer)
+  --data_dir: Path to tf-records directory.
+    (default: '')
+  --div_val: Divide the embedding size by this val for each bin
+    (default: '1')
+    (an integer)
+  --[no]do_eval: Whether to run eval on the dev set.
+    (default: 'false')
+  --[no]do_train: Whether to run training.
+    (default: 'true')
+  --dropatt: Attention dropout rate.
+    (default: '0.0')
+    (a number)
+  --dropout: Dropout rate.
+    (default: '0.1')
+    (a number)
+  --eval_batch_size: Size of valid batch.
+    (default: '16')
+    (an integer)
+  --eval_ckpt_path: Checkpoint path for do_test evaluation. If set, model_dir will be ignored. If unset, will use the latest ckpt in model_dir.
+  --eval_split: Which data split to evaluate.
+    (default: 'valid')
+  --[no]fp16: Whether to enable AMP ops.
+    (default: 'false')
+  --init: <normal|uniform>: Initialization method.
+    (default: 'normal')
+  --init_range: Initialization std when init is uniform.
+    (default: '0.1')
+    (a number)
+  --init_std: Initialization std when init is normal.
+    (default: '0.02')
+    (a number)
+  --learning_rate: Maximum learning rate.
+    (default: '0.01')
+    (a number)
+  --log_interval: Number of iterations per repeat loop.
+    (default: '100')
+    (an integer)
+  --max_eval_batch: Set -1 to turn off. Only used in test mode.
+    (default: '-1')
+    (an integer)
+  --mem_len: Number of steps to cache
+    (default: '192')
+    (an integer)
+  --min_lr_ratio: Minimum ratio learning rate.
+    (default: '0.1')
+    (a number)
+  --model_dir: Estimator model_dir.
+    (default: 'LM-TFM')
+  --n_head: Number of attention heads.
+    (default: '8')
+    (an integer)
+  --n_layer: Number of layers.
+    (default: '16')
+    (an integer)
+  --num_core_per_host: Number of cores per host
+    (default: '8')
+    (an integer)
+  --percentiles: percentiles for latency confidence intervals
+    (default: '90,95,99')
+    (a comma separated list)
+  --proj_init_std: Initialization std for embedding projection.
+    (default: '0.01')
+    (a number)
+  --[no]proj_same_dim: Project the bin with the same dimension.
+    (default: 'true')
+  --[no]proj_share_all_but_first: True to share all but first projs, False not to share.
+    (default: 'false')
+  --record_info_dir: Path to local directory containing filenames.txt.
+    (default: '')
+  --[no]same_length: Same length attention
+    (default: 'false')
+  --save_steps: number of steps for model checkpointing.
+    (default: '5000')
+    (an integer)
+  --tgt_len: Number of steps to predict
+    (default: '192')
+    (an integer)
+  --[no]tie_weight: Tie embedding and softmax weight.
+    (default: 'true')
+  --train_batch_size: Size of train batch.
+    (default: '256')
+    (an integer)
+  --train_steps: Total number of training steps.
+    (default: '40000')
+    (an integer)
+  --[no]untie_r: untie r_w_bias and r_r_bias
+    (default: 'false')
+  --warmup_steps: Number of steps for linear lr warmup.
+    (default: '1000')
+    (an integer)
+```
+
+### Command-line options
+
+To see the full list of available options and their descriptions, use the `--help` command-line option.
+For example:
+
+```
+python3 main.py --help
+```
+
+### Getting the data
+
+The Transformer-XL model was trained on the
+[WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)
+dataset. The WikiText-103 dataset is a collection of over 100 million tokens
+extracted from the set of verified
+[Good](https://en.wikipedia.org/wiki/Wikipedia:Good_articles) and
+[Featured](https://en.wikipedia.org/wiki/Wikipedia:Featured_articles) articles
+on Wikipedia.
+
+This repository contains the `getdata.sh` download script which
+automatically downloads and extracts the training, validation and test
+datasets. By default, data is downloaded to the `data` directory.
+
+In order to test with other datasets, the script needs to be customized
+accordingly.
+
+#### Dataset guidelines
+
+The WikiText-103 dataset was already pre-tokenized with word-level tokens. The
+dataset features a large vocabulary of 267,735 tokens and retains the original
+case, punctuation and numbers.
+
+The `getdata.sh` script downloads the data, extracts the archive and renames
+the training, validation, and test set to `train.txt`, `valid.txt`, `test.txt`
+respectively.
+
+#### Multi-dataset
+
+Using other datasets requires changes in the `tf/data_utils.py` file:
+* the name of the new dataset should be added to the `dataset` flag
+* the support for the new dataset needs to be added to the `Corpus` class:
+    names of files containing training, validation and test data, options for
+    the tokenizer, dataset iterator and desired values of cutoffs for adaptive softmax
+
+The current codebase supports training with a word-level vocabulary
+(automatically generated based on the provided dataset).
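+
+For orientation, the word-level pipeline that a new dataset has to fit into boils
+down to building a vocabulary from whitespace-tokenized text and encoding each
+split into token ids. A self-contained sketch of that idea (not the repository's
+`Corpus`/`Vocab` classes, whose interfaces differ):
+
+```python
+from collections import Counter
+
+def build_vocab(train_path):
+    # Word-level vocabulary built from the training split only.
+    counter = Counter()
+    with open(train_path, encoding='utf-8') as f:
+        for line in f:
+            counter.update(line.split() + ['<eos>'])
+    return {w: i for i, (w, _) in enumerate(counter.most_common())}
+
+def encode(path, vocab):
+    # Encode one split into a flat list of token ids.
+    ids = []
+    with open(path, encoding='utf-8') as f:
+        for line in f:
+            ids.extend(vocab[w] for w in line.split() + ['<eos>'])
+    return ids
+```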
+
+Additionally, using other datasets may require changes in some hyperparameters
+(for example, batch size, learning rate, number of training steps,
+and the configuration of learning rate scheduler). 
+
+### Training process
+
+The default training configuration can be launched by running the
+`run_wt103_base.sh` script with the first argument
+set to `train`. By default, the training results are saved to the `tf/LM-TFM` directory,
+which maps to the `/workspace/transformer-xl/tf/LM-TFM` directory in the container;
+this can be customized by setting the `--model_dir` parameter.
+
+The training script launches a single-node data-parallel training with a fixed
+global batch size of 256, optionally with gradient accumulation to allow
+training on configurations with less than 16 GPUs.
+
+**Command-line**
+
+You can launch training of the Transformer-XL base model on the
+WikiText-103 dataset with the word-based vocabulary and adaptive softmax using
+`<#GPUs>` GPUs. For example:
+
+```
+bash run_wt103_base.sh train <#GPUs> [--fp16] [--batch_chunk CHUNK]
+```
+
+The `--fp16` flag is optional, however, if it's set, then the script
+launches mixed precision training with Tensor Cores; if the flag is not
+present, then the script launches FP32 training.
+
+The `--batch_chunk CHUNK` parameter controls gradient accumulation. With
+gradient accumulation, the batch size is split into `CHUNK` chunks of equal
+size, the training script executes the forward and backward pass using each
+chunk and then executes the optimizer using accumulated gradients.
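+
+A rough TF1-style sketch of this accumulation pattern is shown below (illustrative
+only; `main.py` implements the same idea with its own structure and naming):
+
+```python
+import tensorflow as tf
+
+def build_accumulation_ops(loss, optimizer, batch_chunk):
+    tvars = tf.trainable_variables()
+    grads = tf.gradients(loss, tvars)
+    # One non-trainable accumulator per trainable variable.
+    accums = [tf.Variable(tf.zeros(v.shape, dtype=v.dtype), trainable=False)
+              for v in tvars]
+    zero_op = tf.group(*[a.assign(tf.zeros_like(a)) for a in accums])
+    accum_op = tf.group(*[a.assign_add(g / batch_chunk)
+                          for a, g in zip(accums, grads)])
+    apply_op = optimizer.apply_gradients(list(zip(accums, tvars)))
+    return zero_op, accum_op, apply_op
+
+# Per optimizer step: run zero_op once, run accum_op on each of the `batch_chunk`
+# chunks of the batch, then run apply_op with the accumulated gradients.
+```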
+
+**Examples**
+
+You can launch mixed precision training of the Transformer-XL base model on the
+WikiText-103 dataset using 16 GPUs. For example:
+
+```
+bash run_wt103_base.sh train 16 --fp16 --batch_chunk 1
+```
+
+The batch size per GPU is equal to the default global batch size of 256
+divided by the product of the number of GPUs and the number of chunks. In this
+case, the batch size per GPU is equal to `256 / (16 * 1) = 16`.
+
+You can launch FP32 training using 8 GPUs; the batch size per GPU is equal to 16
+(`--batch_chunk` was set to `2` because a local batch size of 32 runs out
+of memory on a DGX-1 with Tesla V100 16G in FP32 training). For example:
+
+```
+bash run_wt103_base.sh train 8 --batch_chunk 2
+```
+
+A summary of the training progress is printed after every 100 training
+iterations; this can be customized by setting the `--log_interval` parameter.
+The summary is printed in the following format:
+
+```
+step 1300 | lr 0.009998686 | loss 5.09 | pplx  162.70, bpc  7.3461, tok/s 138037
+```
+
+which contains information about the current training step, the current learning
+rate, the current training loss, the training
+[perplexity](https://en.wikipedia.org/wiki/Perplexity#Perplexity_per_word),
+bits per character, and throughput in tokens per second.
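+
+The perplexity and bits-per-character columns are direct transformations of the
+reported loss (average natural-log cross entropy per token). For the line above,
+approximately:
+
+```python
+import math
+
+loss = 5.09               # training loss from the log line (nats per token)
+pplx = math.exp(loss)     # ~162.4, matching "pplx 162.70" up to rounding of the loss
+bpc = loss / math.log(2)  # ~7.34, matching "bpc 7.3461"
+```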
+
+
+The script saves one checkpoint, `model.ckpt`, which contains the most recently saved model.
+By default, model saving is executed every
+5000 training steps; this can be customized by setting the `--save_steps`
+parameter.
+
+Evaluation (inference) benefits from longer attention sequences; therefore, to
+reproduce the perplexity values reported in the [Transformer-XL
+paper](https://arxiv.org/abs/1901.02860), it's necessary to run the final
+evaluation with a dedicated inference script. Refer to the [Inference
+process](#inference-process) section for more details.
+
+### Inference process
+
+Inference can be run by launching the `run_wt103_base.sh` script
+with the first argument set to `eval`. Running
+inference requires a pre-trained model checkpoint.
+
+The script supports only single-GPU inference.
+
+**Command-line**
+
+You can launch inference of the Transformer-XL base model on the
+WikiText-103 dataset with the word-based vocabulary and adaptive softmax.
+
+For example:
+
+```
+bash run_wt103_base.sh eval --model_dir <PATH TO THE CHECKPOINT> [--fp16]
+```
+
+The `--fp16` flag is optional, however, if it's specified, then the script
+launches inference with Tensor Cores; if the flag is not present, then the
+script launches FP32 inference.
+
+**Examples**
+
+To launch mixed precision inference on a single GPU using a checkpoint
+loaded from `LM-TFM/model.ckpt*`, run:
+
+```
+bash run_wt103_base.sh eval --model_dir LM-TFM --fp16
+```
+
+To launch FP32 inference on a single GPU using a checkpoint loaded
+from `LM-TFM/model.ckpt*`, run:
+
+```
+bash run_wt103_base.sh eval --model_dir LM-TFM
+```
+
+After the execution, the script prints a summary in the following format:
+
+```
+I0109 13:02:31.304439 139903273469760 main.py:440] Evaluating with: math fp16
+INFO:tensorflow:| loss 3.15 | pplx   23.32, bpc  4.5432, tok/s   9946, ms/batch 102.84
+```
+
+which contains information about loss, perplexity and execution performance on the test dataset.
+
+## Performance
+
+### Benchmarking
+
+The following section shows how to run benchmarks measuring the model
+performance in training and inference modes.
+
+#### Training performance benchmark
+
+To benchmark the training performance on a specific global batch size `<BS>`,
+with a specific number of GPUs `<#GPUs>` for a specific number of training
+iterations `<ITER>` run:
+
+For the base model:
+
+```
+bash run_wt103_base.sh train <#GPUs> --train_batch_size <BS> --train_steps <ITER> --log_interval 1 [--fp16] [--batch_chunk CHUNK]
+```
+
+It's recommended to launch at least 1500 training steps to get a reliable
+estimate of training performance. For more information about the available
+options, refer to the [Training process](#training-process) section.
+
+The training script prints information in the following format:
+
+```
+(...)
+[1,0]<stderr>:INFO:tensorflow:step 99 | lr 0.000990000 | loss 9.22 | pplx 10069.60, bpc 13.2977, tok/s 136092
+[1,0]<stderr>:I0109 12:18:41.333325 140403024426816 main.py:333] step 99 | lr 0.000990000 | loss 9.22 | pplx 10069.60,
+bpc 13.2977, tok/s 136092
+[1,0]<stderr>:INFO:tensorflow:step 100 | lr 0.001000000 | loss 9.21 | pplx 9981.87, bpc 13.2851, tok/s 135309
+[1,0]<stderr>:I0109 12:18:41.696926 140403024426816 main.py:333] step 100 | lr 0.001000000 | loss 9.21 | pplx 9981.87,
+bpc 13.2851, tok/s 135309
+(...)
+[1,0]<stderr>:INFO:tensorflow:Training throughput: 135959 tok/s
+```
+
+The last two lines contain information on the
+average training throughput measured in tokens per second.
+
+#### Inference performance benchmark
+
+The inference performance and accuracy benchmarks require a checkpoint from a
+trained model.
+
+To benchmark the inference performance on a specific global batch size `<BS>`, run:
+
+```
+bash run_wt103_base.sh eval --model_dir <CHECKPOINT_DIR> --eval_batch_size <BS> [--fp16]
+```
+
+The inference script prints information in the following format:
+
+```
+I0109 13:02:31.304439 139903273469760 main.py:440] Evaluating with: math fp16
+INFO:tensorflow:| loss 3.15 | pplx   23.32, bpc  4.5432, tok/s   9946, ms/batch 102.84
+```
+
+The output contains information on the achieved test loss and test perplexity,
+the average inference throughput (measured in tokens per second), and the average
+inference latency (measured in milliseconds).
+
+### Results
+
+The following sections provide details on how we achieved our performance and
+accuracy in training and inference.
+
+#### Training accuracy results
+
+##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
+
+###### Base model
+Our results were obtained by running the `tf/run_wt103_base.sh`
+training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1
+with 8x V100 16G GPUs.
+
+|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (perplexity)**|**Accuracy - Mixed precision (perplexity)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**|
+|-------:|-------------------:|-------------------------------:|------------------------------------------:|---------------------------------:|--------------------------------------------:|--------------------------------------------------:|
+| 1 | 16 | 23.64 | 23.58 | 2943 | 2011 | 1.46 |
+| 8 | 16 | 23.36 | 23.38 | 439  | 333 | 1.32 |
+
+##### Training accuracy: NVIDIA DGX-2 (16x V100 32G)
+
+###### Base model
+
+Our results were obtained by running the `tf/run_wt103_base.sh`
+training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-2
+with 16x V100 32G GPUs.
+
+|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (perplexity)**|**Accuracy - Mixed precision (perplexity)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**|
+|-------:|-------------------:|-------------------------------:|------------------------------------------:|---------------------------------:|--------------------------------------------:|--------------------------------------------------:|
+| 16 | 16 | 23.39 | 23.37 | 202 | 161 | 1.25 |
+| 8 | 32 | 23.33 | 23.40 | 330 | 227 | 1.46 |
+
+
+##### Training loss plot
+
+###### Base model
+
+![TrainingLossBase](tf/img/training_loss_base.png)
+
+##### Training stability test
+
+###### Base model
+The Transformer-XL base model was trained for 40,000 training steps, starting
+from 20 different initial random seeds. The training was performed in the tensorflow:19.12-tf1-py3 NGC container on
+NVIDIA DGX-1 with 8x V100 16G GPUs.
+After training, the models were evaluated on the test dataset. The following
+table summarizes the final perplexity on the test set.
+
+|**Average perplexity**|**Standard deviation**|**Minimum**|**Maximum**|**Median**|
+|---------------------:|---------------------:|----------:|----------:|---------:|
+| 23.39 | 0.0878 | 23.24 | 23.58 | 23.39 |
+
+#### Training performance results
+
+##### Training performance: NVIDIA DGX-1 (8x V100 16G)
+
+###### Base model
+
+Our results were obtained by running the `tf/run_wt103_base.sh`
+training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1 with 8x
+V100 16G GPUs. Performance numbers (in tokens per second) were averaged over 2000
+training iterations.
+
+|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (tok/s)**|**Throughput - Mixed precision (tok/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**|
+|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:|
+| 1 | 16 |  9,104 | 13,004  | 1.428 | 1.000 | 1.000 |
+| 2 | 16 | 18,169 | 23,856  | 1.313 | 1.996 | 1.835 |
+| 4 | 16 | 38,876 | 50,310  | 1.294 | 4.270 | 3.869 |
+| 8 | 16 | 78,626 | 101,954 | 1.297 | 8.636 | 7.840 |
+
+To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
+
+##### Training performance: NVIDIA DGX-2 (16x V100 32G)
+
+###### Base model
+
+Our results were obtained by running the `tf/run_wt103_base.sh` training
+script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G
+GPUs. Performance numbers (in tokens per second) were averaged over 2000
+training iterations.
+
+|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (tok/s)**|**Throughput - Mixed precision (tok/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**|
+|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:|
+| 1  | 16 | 9,891   | 13,791  | 1.394 | 1.000  | 1.000  |
+| 2  | 16 | 21,550  | 28,306  | 1.314 | 2.179  | 2.052  |
+| 4  | 16 | 42,616  | 55,430  | 1.301 | 4.309  | 4.019  |
+| 8  | 16 | 83,932  | 107,999 | 1.287 | 8.486  | 7.831  |
+| 16 | 16 | 164,675 | 206,906 | 1.256 | 16.649 | 15.003 |
+
+To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
+
+#### Inference performance results
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
+
+###### Base model
+
+Our results were obtained by running the
+`tf/scripts/inference_benchmark.sh` inference benchmarking script in the
+tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPU.
+
+The command to launch the inference performance benchmark is provided in the
+[Inference performance benchmark](#inference-performance-benchmark) section.
+
+**FP16**
+
+|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**|
+|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|  1  | 64 | 640 | 1394.7    | 45.91  | 47.18  | 47.98  | 49.47  |
+|  2  | 64 | 640 | 2560.9    | 50.00  | 51.30  | 52.08  | 54.94  |
+|  4  | 64 | 640 | 4326.6    | 59.14  | 60.47  | 61.21  | 63.00  |
+|  8  | 64 | 640 | 6621.9    | 77.29  | 78.50  | 79.01  | 81.36  |
+| 16  | 64 | 640 | 8872.3    | 115.34 | 116.93 | 117.98 | 121.15 |
+| 32  | 64 | 640 | 10441.9   | 196.00 | 197.94 | 199.43 | 203.96 |
+
+**FP32**
+
+|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**|
+|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|  1  | 64 | 640 | 1315.2  | 48.70  | 49.78  | 50.54  | 53.31  |
+|  2  | 64 | 640 | 2419.2  | 52.91  | 54.17  | 54.73  | 56.13  |
+|  4  | 64 | 640 | 4012.7  | 63.76  | 65.27  | 66.11  | 67.81  |
+|  8  | 64 | 640 | 5650.1  | 90.56  | 91.92  | 92.47  | 94.15  |
+| 16  | 64 | 640 | 7041.2  | 145.34 | 147.20 | 148.38 | 151.37 |
+| 32  | 64 | 640 | 8051.3  | 254.14 | 256.58 | 257.51 | 258.39 |
+
+To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
+
+##### Inference performance: NVIDIA T4
+
+###### Base model
+
+Our results were obtained by running the
+`tf/scripts/inference_benchmark.sh` inference benchmarking script in the
+tensorflow:19.12-tf1-py3 NGC container on NVIDIA T4.
+
+The command to launch the inference performance benchmark is provided in the
+[Inference performance benchmark](#inference-performance-benchmark) section.
+
+**FP16**
+
+|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**|
+|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|  1  | 64 | 640 | 1053.6    | 60.75  | 61.59  | 62.02  | 63.58  |
+|  2  | 64 | 640 | 2024.5    | 63.22  | 63.95  | 64.76  | 67.33  |
+|  4  | 64 | 640 | 3309.7    | 77.30  | 78.33  | 78.85  | 80.12  |
+|  8  | 64 | 640 | 4713.7    | 108.53 | 109.66 | 110.26 | 111.15 |
+| 16  | 64 | 640 | 6075.8    | 168.40 | 169.62 | 170.28 | 171.88 |
+| 32  | 64 | 640 | 6850.5    | 298.69 | 300.42 | 301.04 | 302.21 |
+
+**FP32**
+
+|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**|
+|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
+|  1  | 64 | 640 | 929.5  | 68.88  | 70.43  | 70.88  | 72.05  |
+|  2  | 64 | 640 | 1757.6  | 72.84  | 74.30  | 75.08  | 76.62  |
+|  4  | 64 | 640 | 2696.7  | 94.87  | 97.02  | 97.58  | 99.19  |
+|  8  | 64 | 640 | 3561.6  | 143.65 | 145.98 | 146.96 | 148.18 |
+| 16  | 64 | 640 | 4190.4  | 244.16 | 246.34 | 246.62 | 247.32 |
+| 32  | 64 | 640 | 4567.7  | 447.96 | 451.19 | 452.77 | 455.32 |
+
+To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
+
+## Release notes
+
+### Changelog
+
+* April 2020
+  * Initial release
+  * Support for FP32 and mixed precision training on NVIDIA
+    DGX-1, NVIDIA DGX-2, and inference on NVIDIA Tesla V100 16G
+    and NVIDIA T4
+
+### Known issues
+
+There are no known issues with this model.

+ 120 - 0
TensorFlow/LanguageModeling/Transformer-XL/getdata.sh

@@ -0,0 +1,120 @@
+# BSD 3-Clause License
+# 
+# Copyright (c) 2017, 
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# 
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+# 
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# 
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
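+# Downloads and prepares the language modeling datasets used by this example
+# (WikiText-2, WikiText-103, enwik8, text8, Penn Treebank and One Billion
+# Words) under ./data/.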
+echo "=== Acquiring datasets ==="
+echo "---"
+
+mkdir -p data
+cd data
+
+if [[ ! -d 'wikitext-2' ]]; then
+    echo "- Downloading WikiText-2 (WT2)"
+    wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
+    unzip -q wikitext-2-v1.zip
+    cd wikitext-2
+    mv wiki.train.tokens train.txt
+    mv wiki.valid.tokens valid.txt
+    mv wiki.test.tokens test.txt
+    cd ..
+fi
+
+echo "- Downloading WikiText-103 (WT2)"
+if [[ ! -d 'wikitext-103' ]]; then
+    wget --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
+    unzip -q wikitext-103-v1.zip
+    cd wikitext-103
+    mv wiki.train.tokens train.txt
+    mv wiki.valid.tokens valid.txt
+    mv wiki.test.tokens test.txt
+    cd ..
+fi
+
+echo "- Downloading enwik8 (Character)"
+if [[ ! -d 'enwik8' ]]; then
+    mkdir -p enwik8
+    cd enwik8
+    wget --continue http://mattmahoney.net/dc/enwik8.zip
+    wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py
+    python3 prep_enwik8.py
+    cd ..
+fi
+
+echo "- Downloading text8 (Character)"
+if [[ ! -d 'text8' ]]; then
+    mkdir -p text8
+    cd text8
+    wget --continue http://mattmahoney.net/dc/text8.zip
+    python ../../prep_text8.py
+    cd ..
+fi
+
+echo "- Downloading Penn Treebank (PTB)"
+if [[ ! -d 'penn' ]]; then
+    wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
+    tar -xzf simple-examples.tgz
+
+    mkdir -p penn
+    cd penn
+    mv ../simple-examples/data/ptb.train.txt train.txt
+    mv ../simple-examples/data/ptb.test.txt test.txt
+    mv ../simple-examples/data/ptb.valid.txt valid.txt
+    cd ..
+
+    echo "- Downloading Penn Treebank (Character)"
+    mkdir -p pennchar
+    cd pennchar
+    mv ../simple-examples/data/ptb.char.train.txt train.txt
+    mv ../simple-examples/data/ptb.char.test.txt test.txt
+    mv ../simple-examples/data/ptb.char.valid.txt valid.txt
+    cd ..
+
+    rm -rf simple-examples/
+fi
+
+echo "- Downloading 1B words"
+
+if [[ ! -d 'one-billion-words' ]]; then
+    mkdir -p one-billion-words
+    cd one-billion-words
+
+    wget --no-proxy http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
+    tar xzvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
+
+    path="1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/"
+    cat ${path}/news.en.heldout-00000-of-00050 > valid.txt
+    cat ${path}/news.en.heldout-00000-of-00050 > test.txt
+
+    wget https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt
+
+    cd ..
+fi
+
+echo "---"
+echo "Happy language modeling :)"

+ 62 - 0
TensorFlow/LanguageModeling/Transformer-XL/prep_text8.py

@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+# BSD 3-Clause License
+#
+# Copyright (c) 2017,
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import sys
+import zipfile
+
+from io import open
+
+if os.path.exists('train.txt'):
+    print('Tokenized text8 already exists - skipping processing')
+    sys.exit()
+
+zipfile.ZipFile('text8.zip').extractall()
+data = open('text8', 'r', encoding='utf-8').read()
+
+print('Length of text8: {}'.format(len(data)))
+
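+# text8 contains 100M characters; the last 10M are held out and split evenly
+# between validation and test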
+num_test_chars = 5000000
+
+train_data = data[: -2 * num_test_chars]
+valid_data = data[-2 * num_test_chars: -num_test_chars]
+test_data = data[-num_test_chars:]
+
+for fn, part in [('train.txt', train_data), ('valid.txt', valid_data), ('test.txt', test_data)]:
+    print('{} will have {} bytes'.format(fn, len(part)))
+    print('- Tokenizing...')
+    # Change space ' ' to underscore '_'
+    part_str = ' '.join(['_' if c == ' ' else c for c in part.strip()])
+    print('- Writing...')
+    with open(fn, 'w') as f:
+        f.write(part_str)
+    with open(fn + '.raw', 'w', encoding='utf-8') as f:
+        f.write(part)

+ 488 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/data_utils.py

@@ -0,0 +1,488 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import os
+from functools import partial
+
+from collections import Counter, OrderedDict
+import pickle
+import json
+import multiprocessing as mp
+
+import numpy as np
+
+from absl import flags
+import tensorflow as tf
+from vocabulary import Vocab
+
+from tensorflow.gfile import Exists as exists
+from tensorflow.gfile import MakeDirs as makedirs
+from tensorflow.gfile import Glob as glob
+
+
+def _preprocess(shard, train, vocab, save_dir, cutoffs, bin_sizes, bsz, tgt_len,
+                num_core_per_host, num_shuffle):
+  file_names = []
+  num_batch = 0
+
+  path = train[shard]
+  data_shard = vocab.encode_file(path, ordered=False, add_double_eos=True)
+
+  for shuffle in range(num_shuffle):
+    basename = "train-{:03d}-{:02d}".format(shard, shuffle)
+    print("Processing shard {} shuffle {}".format(shard, shuffle))
+
+    np.random.shuffle(data_shard)
+    file_name, num_batch_shuffle = create_ordered_tfrecords(
+        save_dir, basename, np.concatenate(data_shard), bsz, tgt_len,
+        num_core_per_host, cutoffs, bin_sizes)
+    file_names.append(file_name)
+    num_batch += num_batch_shuffle
+
+  return file_names, num_batch
+
+
+class Corpus(object):
+  def __init__(self, path, dataset, *args, **kwargs):
+    self.dataset = dataset
+    self.vocab = Vocab(*args, **kwargs)
+
+    if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
+      self.vocab.count_file(os.path.join(path, "train.txt"))
+      self.vocab.count_file(os.path.join(path, "valid.txt"))
+      self.vocab.count_file(os.path.join(path, "test.txt"))
+    elif self.dataset == "wt103":
+      self.vocab.count_file(os.path.join(path, "train.txt"))
+    elif self.dataset == "lm1b":
+      train_path_pattern = os.path.join(
+          path, "1-billion-word-language-modeling-benchmark-r13output",
+          "training-monolingual.tokenized.shuffled", "news.en-*")
+      train_paths = glob(train_path_pattern)
+
+      # the vocab will load from file when build_vocab() is called
+      # for train_path in sorted(train_paths):
+      #   self.vocab.count_file(train_path, verbose=True)
+
+    self.vocab.build_vocab()
+
+    if self.dataset in ["ptb", "wt2", "wt103"]:
+      self.train = self.vocab.encode_file(
+          os.path.join(path, "train.txt"), ordered=True)
+      self.valid = self.vocab.encode_file(
+          os.path.join(path, "valid.txt"), ordered=True)
+      self.test  = self.vocab.encode_file(
+          os.path.join(path, "test.txt"), ordered=True)
+    elif self.dataset in ["enwik8", "text8"]:
+      self.train = self.vocab.encode_file(
+          os.path.join(path, "train.txt"), ordered=True, add_eos=False)
+      self.valid = self.vocab.encode_file(
+          os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
+      self.test  = self.vocab.encode_file(
+          os.path.join(path, "test.txt"), ordered=True, add_eos=False)
+    elif self.dataset == "lm1b":
+      self.train = train_paths
+      valid_path = os.path.join(path, "valid.txt")
+      test_path = valid_path
+      self.valid = self.vocab.encode_file(
+          valid_path, ordered=True, add_double_eos=True)
+      self.test  = self.vocab.encode_file(
+          test_path, ordered=True, add_double_eos=True)
+
+    if self.dataset == "wt103":
+      self.cutoffs = [0, 19997, 39997, 199997] + [len(self.vocab)]
+    elif self.dataset == "lm1b":
+      self.cutoffs = [0, 59997, 99997, 639997] + [len(self.vocab)]
+    else:
+      self.cutoffs = []
+
+
+  def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
+                           num_core_per_host, **kwargs):
+    FLAGS = kwargs.get('FLAGS')
+
+    file_names = []
+
+    record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
+        split, bsz, tgt_len)
+
+    record_info_path = os.path.join(save_dir, record_name)
+
+    if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
+      data = getattr(self, split)
+      bin_sizes = get_bin_sizes(
+          data, bsz // num_core_per_host, tgt_len, self.cutoffs)
+      file_name, num_batch = create_ordered_tfrecords(
+          save_dir, split, data, bsz, tgt_len, num_core_per_host,
+          self.cutoffs, bin_sizes,
+          num_passes=FLAGS.num_passes if split == 'train' else 1)
+      file_names.append(file_name)
+    elif self.dataset == "lm1b":
+      bin_sizes = get_bin_sizes(
+          self.valid, bsz // num_core_per_host, tgt_len, self.cutoffs)
+      if split == "train":
+        np.random.seed(123456)
+        num_batch = 0
+
+        if FLAGS.num_procs > 1:
+          _preprocess_wrapper = partial(_preprocess,
+              train=self.train, vocab=self.vocab, save_dir=save_dir,
+              cutoffs=self.cutoffs, bin_sizes=bin_sizes, bsz=bsz,
+              tgt_len=tgt_len, num_core_per_host=num_core_per_host,
+              num_shuffle=FLAGS.num_shuffle)
+
+          pool = mp.Pool(processes=FLAGS.num_procs)
+          results = pool.map(_preprocess_wrapper, range(len(self.train)))
+          for res in results:
+            file_names.extend(res[0])
+            num_batch += res[1]
+        else:
+          for shard, path in enumerate(self.train):
+            data_shard = self.vocab.encode_file(path, ordered=False,
+                                                add_double_eos=True)
+
+            num_shuffle = FLAGS.num_shuffle
+
+            for shuffle in range(num_shuffle):
+              print("Processing shard {} shuffle {}".format(shard, shuffle))
+              basename = "train-{:03d}-{:02d}".format(shard, shuffle)
+              np.random.shuffle(data_shard)
+              file_name, num_batch_ = create_ordered_tfrecords(
+                  save_dir, basename, np.concatenate(data_shard), bsz, tgt_len,
+                  num_core_per_host,
+                  self.cutoffs, bin_sizes)
+              file_names.append(file_name)
+              num_batch += num_batch_
+
+      else:
+        file_name, num_batch = create_ordered_tfrecords(
+            save_dir, split, getattr(self, split), bsz, tgt_len,
+            num_core_per_host,
+            self.cutoffs, bin_sizes)
+        file_names.append(file_name)
+
+    with open(record_info_path, "w") as fp:
+      record_info = {
+        "filenames": file_names,
+        "bin_sizes": bin_sizes,
+        "num_batch": num_batch
+      }
+      json.dump(record_info, fp)
+
+
+def get_bin_sizes(data, batch_size, tgt_len, cutoffs, std_mult=[2.5, 2.5, 2.5]):
+  """
+    Note: the `batch_size` here should be per-core batch size
+  """
+  bin_sizes = []
+
+  def _nearest_to_eight(x):
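+    # round the bin size to the nearest multiple of 8, never going below 8;
+    # keeping the padded dimensions divisible by 8 tends to be friendlier to
+    # Tensor Cores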
+    y = x - x % 8
+    return y + 8 if x % 8 >= 4 else max(8, y)
+
+  if cutoffs:
+    num_batch = len(data) // batch_size // tgt_len
+
+    data = data[:batch_size * num_batch * tgt_len]
+    data = data.reshape(batch_size, num_batch, tgt_len)
+
+    tot = batch_size * tgt_len
+    for b, (left, right) in enumerate(zip(cutoffs[1:-1], cutoffs[2:])):
+      mask = (data >= left) * (data < right)
+      percents = mask.astype(np.float64).sum(2).sum(0) / tot
+      mean = np.mean(percents)
+      std = np.std(percents)
+
+      bin_size = int(math.ceil(tgt_len * batch_size * (mean + std_mult[b] * std)))
+      bin_size = _nearest_to_eight(bin_size)
+      bin_sizes.append(bin_size)
+
+  return bin_sizes
+
+
+def _int64_feature(values):
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
+
+def _float_feature(values):
+  return tf.train.Feature(float_list=tf.train.FloatList(value=values))
+
+def batchify(data, batch_size, num_passes):
+  """
+    if num_passes > 1
+
+    Here, we use multiple randomly shifted copies.
+  """
+  if num_passes > 1:
+    data_len = len(data)
+    double_data = np.concatenate([data, data])
+    data_list = []
+    for i in range(num_passes):
+      start = np.random.randint(0, data_len)
+      data_list.append(double_data[start:start+data_len])
+    data = np.concatenate(data_list)
+
+  num_step = len(data) // batch_size
+  data = data[:batch_size * num_step]
+  data = data.reshape(batch_size, num_step)
+
+  return data
+
+
+def create_ordered_tfrecords(save_dir, basename, data, batch_size, tgt_len,
+                             num_core_per_host, cutoffs=[], bin_sizes=[],
+                             num_passes=1):
+
+  file_name = "{}.bsz-{}.tlen-{}.tfrecords".format(
+      basename, batch_size, tgt_len)
+
+  save_path = os.path.join(save_dir, file_name)
+  record_writer = tf.python_io.TFRecordWriter(save_path)
+
+  batched_data = batchify(data, batch_size, num_passes)
+
+  num_batch = 0
+  for t in range(0, batched_data.shape[1] - 1, tgt_len):
+    cur_tgt_len = min(batched_data.shape[1] - 1 - t, tgt_len)
+    if num_batch % 500 == 0:
+      print("  processing batch {}".format(num_batch))
+    for idx in range(batch_size):
+      inputs = batched_data[idx, t:t + cur_tgt_len]
+      labels = batched_data[idx, t + 1:t + cur_tgt_len + 1]
+
+      # features dict
+      feature = {
+          "inputs": _int64_feature(inputs),
+          "labels": _int64_feature(labels),
+      }
+
+      example = tf.train.Example(features=tf.train.Features(feature=feature))
+      record_writer.write(example.SerializeToString())
+
+    num_batch += 1
+
+  record_writer.close()
+  print("Done writing {}. batches: {}".format(file_name, num_batch))
+
+  return file_name, num_batch
+
+
+def get_lm_corpus(data_dir, dataset):
+  fn = os.path.join(data_dir, "cache.pkl")
+
+  if exists(fn):
+    print("Loading cached dataset...")
+    with open(fn, "rb") as fp:
+      corpus = pickle.load(fp)
+  else:
+    print("Producing dataset...")
+    kwargs = {}
+    if dataset in ["wt103", "wt2"]:
+      kwargs["special"] = ["<eos>"]
+      kwargs["lower_case"] = False
+    elif dataset == "ptb":
+      kwargs["special"] = ["<eos>"]
+      kwargs["lower_case"] = True
+    elif dataset == "lm1b":
+      kwargs["special"] = []
+      kwargs["lower_case"] = False
+      kwargs["vocab_file"] = os.path.join(data_dir, "1b_word_vocab.txt")
+    elif dataset in ["enwik8", "text8"]:
+      pass
+
+    corpus = Corpus(data_dir, dataset, **kwargs)
+
+    print("Saving dataset...")
+    with open(fn, "wb") as fp:
+      pickle.dump(corpus, fp, protocol=2)
+
+    corpus_info = {
+      "vocab_size" : len(corpus.vocab),
+      "cutoffs" : corpus.cutoffs,
+      "dataset" : corpus.dataset
+    }
+    with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp:
+      json.dump(corpus_info, fp)
+
+  return corpus
+
+
+def main(unused_argv):
+  del unused_argv  # Unused
+
+  corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset)
+
+  save_dir = os.path.join(FLAGS.data_dir, "tfrecords")
+  if not exists(save_dir):
+    makedirs(save_dir)
+
+  # test mode
+  if FLAGS.eval_batch_size > 0:
+    corpus.convert_to_tfrecords("test", save_dir, FLAGS.eval_batch_size,
+                                FLAGS.tgt_len, FLAGS.num_core_per_host,
+                                FLAGS=FLAGS)
+    return
+
+  for split, batch_size in zip(
+      ["train", "valid"],
+      [FLAGS.train_batch_size // FLAGS.batch_chunk, FLAGS.valid_batch_size]):
+
+    if batch_size <= 0: continue
+    print("Converting {} set...".format(split))
+    corpus.convert_to_tfrecords(split, save_dir, batch_size, FLAGS.tgt_len,
+                                FLAGS.num_core_per_host, FLAGS=FLAGS)
+
+
+def load_record_info(record_info_dir, split, per_host_bsz, tgt_len,
+                     num_core_per_host):
+  record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
+      split, per_host_bsz, tgt_len)
+
+  record_info_path = os.path.join(record_info_dir, record_name)
+  with open(record_info_path, "r") as fp:
+    record_info = json.load(fp)
+
+  return record_info
+
+def get_input_fn(record_info_dir, split, per_host_bsz, tgt_len,
+                 num_core_per_host, num_hosts=1):
+  """Creates input function."""
+  record_info = load_record_info(record_info_dir, split, per_host_bsz, tgt_len,
+                                 num_core_per_host)
+
+  file_names = record_info["filenames"]
+  bin_sizes = record_info["bin_sizes"]
+  num_batch = record_info["num_batch"]
+
+  tf.logging.info("[{}] File names {}".format(split, file_names))
+
+  def input_fn(params):
+    # per-core batch size
+    per_core_bsz = params["batch_size"] // num_core_per_host
+
+    # data_dir could be a remote path, e.g., a google storage url
+    data_dir = params["data_dir"]
+
+    def parser(record):
+      # preprocess "inp_perm" and "tgt_perm"
+      def _process_perm_feature(example, prefix):
+        for b in range(len(bin_sizes)):
+          cnt = example.pop("{}_cnt_{}".format(prefix, b))[0]
+          tup = example.pop("{}_tup_{}".format(prefix, b))
+
+          tup = tf.reshape(
+              tf.sparse_tensor_to_dense(tup),
+              shape=[cnt, 2])
+
+          # tf.float32
+          perm = tf.sparse_to_dense(
+              sparse_indices=tup,
+              output_shape=[tgt_len, bin_sizes[b]],
+              sparse_values=1.0,
+              default_value=0.0)
+
+          example["{}_perm_{}".format(prefix, b)] = perm
+
+      # whether allow the last batch with a potentially shorter length
+      record_spec = {
+          "inputs": tf.VarLenFeature(tf.int64),
+          "labels": tf.VarLenFeature(tf.int64),
+      }
+
+      # retrieve serialized example
+      example = tf.parse_single_example(
+          serialized=record,
+          features=record_spec)
+
+      # cast int64 into int32
+      # cast sparse to dense
+      for key in list(example.keys()):
+        val = example[key]
+        if tf.keras.backend.is_sparse(val):
+          val = tf.sparse.to_dense(val)
+        if val.dtype == tf.int64:
+          val = tf.to_int32(val)
+        example[key] = val
+
+      return example["inputs"], example["labels"]
+
+    file_paths = []
+    for file_name in file_names:
+      file_path = os.path.join(data_dir, file_name)
+      file_paths.append(file_path)
+
+    if split == "train":
+      dataset = tf.data.Dataset.from_tensor_slices(file_paths)
+      if len(file_paths) > 1:
+        dataset = dataset.shuffle(len(file_paths)).repeat()
+        dataset = tf.data.TFRecordDataset(dataset)
+      elif num_hosts > 1:
+        host_id = params["context"].current_host
+        # drop the remaining batches
+        num_batch_per_host = num_batch // num_hosts
+
+        my_start_sample_id = (host_id * num_batch_per_host * num_core_per_host *
+                              per_core_bsz)
+        my_sample_num = num_batch_per_host * num_core_per_host * per_core_bsz
+        dataset = tf.data.TFRecordDataset(dataset).skip(
+            my_start_sample_id).take(my_sample_num)
+      else:
+        dataset = tf.data.TFRecordDataset(dataset)
+
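+      # when running with multiple GPUs per host, each Horovod worker
+      # processes a disjoint shard of the training records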
+      if num_core_per_host > 1:
+        import horovod.tensorflow as hvd
+        dataset = dataset.shard(hvd.size(), hvd.rank())
+      dataset = dataset.map(parser).cache().repeat()
+      dataset = dataset.batch(per_core_bsz, drop_remainder=True)
+      dataset = dataset.prefetch(num_core_per_host * per_core_bsz)
+    else:
+      # do not shuffle, repeat or cache in evaluation
+      dataset = tf.data.Dataset.from_tensor_slices(file_paths)
+      dataset = tf.data.TFRecordDataset(dataset)
+      dataset = dataset.map(parser)
+      dataset = dataset.batch(per_core_bsz, drop_remainder=True)
+
+    return dataset
+
+  if split == "train" and num_hosts > 1:
+    record_info["num_batch"] = num_batch // num_hosts
+
+  return input_fn, record_info
+
+def get_corpus_info(corpus_info_path):
+  with open(corpus_info_path, "r") as fp:
+    corpus_info = json.load(fp)
+  return corpus_info
+
+if __name__ == "__main__":
+  FLAGS = flags.FLAGS
+  flags.DEFINE_string("data_dir", None,
+        help="Location of the data corpus")
+  flags.DEFINE_enum("dataset", "wt103",
+        ["ptb", "wt2", "wt103", "lm1b", "enwik8", "text8"],
+        help="Dataset name.")
+  flags.DEFINE_integer("train_batch_size", 256,
+        help="train batch size each host")
+  flags.DEFINE_integer("valid_batch_size", 256,
+        help="valid batch size each host")
+  flags.DEFINE_integer("eval_batch_size", 16,
+        help="If > 0, enter test mode and process test set only."
+             "Otherwise, process train and dev sets only.")
+  flags.DEFINE_integer("tgt_len", 70,
+        help="number of tokens to predict")
+  flags.DEFINE_integer("max_batch", -1,
+        help="run in debug mode")
+  flags.DEFINE_integer("num_core_per_host", 8,
+        help="number of GPUs per host")
+  flags.DEFINE_bool("debug", default=False,
+        help="Process only the first batch without shuffle for lm1b.")
+  flags.DEFINE_integer("num_procs", 1,
+        help="number of processes")
+  flags.DEFINE_integer("num_passes", 10,
+        help="number of passes")
+  flags.DEFINE_integer("num_shuffle", 4,
+        help="number of shuffles for lm1b")
+  flags.DEFINE_integer("batch_chunk", 1,
+        help="number of accumulation steps")
+
+  tf.app.run(main)

+ 56 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/exp_utils.py

@@ -0,0 +1,56 @@
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dllogger
+import os
+
+class AverageMeter:
+    """
+    Computes and stores the average and current value
+    """
+    def __init__(self, warmup=0, keep=False):
+        self.reset()
+        self.warmup = warmup
+        self.keep = keep
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+        self.iters = 0
+        self.vals = []
+
+    def update(self, val, n=1):
+        self.iters += 1
+        self.val = val
+
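+        # the first `warmup` updates only set `val`; they are excluded from
+        # the running average so that startup iterations do not skew it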
+        if self.iters > self.warmup:
+            self.sum += val * n
+            self.count += n
+            self.avg = self.sum / self.count
+            if self.keep:
+                self.vals.append(val)
+
+def setup_dllogger(enabled=True, filename=os.devnull, rank=0):
+    if enabled and rank == 0:
+        backends = [
+            dllogger.JSONStreamBackend(
+                dllogger.Verbosity.VERBOSE,
+                filename,
+                ),
+            ]
+        dllogger.init(backends)
+    else:
+        dllogger.init([])

BIN
TensorFlow/LanguageModeling/Transformer-XL/tf/img/model.png


BIN
TensorFlow/LanguageModeling/Transformer-XL/tf/img/training_loss_base.png


+ 179 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/lamb.py

@@ -0,0 +1,179 @@
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# MIT License
+#
+# Copyright (c) 2019 cybertronai
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import tensorflow as tf
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer
+
+class LAMBOptimizer(optimizer.Optimizer):
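+  """Layer-wise Adaptive Moments (LAMB) optimizer.
+
+  Sketch of the dense update implemented below (note that no bias correction
+  is applied in `_apply_dense`):
+      m <- beta1 * m + (1 - beta1) * grad
+      v <- beta2 * v + (1 - beta2) * grad**2
+      update = m / (sqrt(v) + epsilon) + wd * var
+      ratio  = min(||var||, 10) / ||update||   (1.0 if either norm is zero)
+      var   <- var - learning_rate * ratio * update
+  """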
+
+  def __init__(self, learning_rate=0.001, wd=0.01, beta1=0.9, beta2=0.999, epsilon=1e-6,
+               use_locking=False, name="LAMB"):
+
+    super(LAMBOptimizer, self).__init__(use_locking, name)
+    self._lr = learning_rate
+    self._beta1 = beta1
+    self._beta2 = beta2
+    self._epsilon = epsilon
+    self._wd = wd
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._lr_t = None
+    self._beta1_t = None
+    self._beta2_t = None
+    self._epsilon_t = None
+    self._wd_t = None
+
+  def _get_beta_accumulators(self):
+    with ops.init_scope():
+      if context.executing_eagerly():
+        graph = None
+      else:
+        graph = ops.get_default_graph()
+      return (self._get_non_slot_variable("beta1_power", graph=graph),
+              self._get_non_slot_variable("beta2_power", graph=graph))
+
+  def _create_slots(self, var_list):
+    first_var = min(var_list, key=lambda x: x.name)
+    self._create_non_slot_variable(initial_value=self._beta1,
+                                   name="beta1_power",
+                                   colocate_with=first_var)
+    self._create_non_slot_variable(initial_value=self._beta2,
+                                   name="beta2_power",
+                                   colocate_with=first_var)
+
+    for v in var_list:
+      self._zeros_slot(v, "m", self._name)
+      self._zeros_slot(v, "v", self._name)
+
+  def _prepare(self):
+    lr = self._call_if_callable(self._lr)
+    beta1 = self._call_if_callable(self._beta1)
+    beta2 = self._call_if_callable(self._beta2)
+    epsilon = self._call_if_callable(self._epsilon)
+    wd = self._call_if_callable(self._wd)
+
+    self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
+    self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
+    self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
+    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
+    self._wd_t = ops.convert_to_tensor(wd, name="wd")
+
+  def _apply_dense(self, grad, var):
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    eps = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    wd_lambda = math_ops.cast(self._wd_t, var.dtype.base_dtype)
+
+    v = self.get_slot(var, "v")
+    v_t = v.assign(beta2_t * v + (1. - beta2_t) * grad**2)
+    m = self.get_slot(var, "m")
+    m_t = m.assign(beta1_t * m + (1. - beta1_t) * grad)
+
+    # add l2 normalizations and set ratio
+    r1 = tf.sqrt(tf.reduce_sum(tf.square(var)))
+    step = m_t / (tf.sqrt(v_t) + eps) + wd_lambda * var
+    r2 = tf.sqrt(tf.reduce_sum(tf.square(step)))
+
+    ratio = array_ops.where(math_ops.greater(r1, 0), array_ops.where(
+        math_ops.greater(r2, 0), tf.minimum(r1, 10) / r2, 1.0), 1.0)
+    var_update = state_ops.assign_sub(var, lr_t * ratio * step)
+    return control_flow_ops.group(*[var_update, v_t, m_t])
+
+  def _resource_apply_dense(self, grad, var):
+    return None
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta1_t)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    step = m_t / (v_sqrt + epsilon_t)
+    w_norm = linalg_ops.norm(var, ord=2)
+    g_norm = linalg_ops.norm(step, ord=2)
+    ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
+        math_ops.greater(g_norm, 0), tf.minimum(w_norm, 10) / g_norm, 1.0), 1.0)
+    var_update = state_ops.assign_sub(
+        var, ratio * lr_t * step, use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var):
+    return self._apply_sparse_shared(
+        grad.values,
+        var,
+        grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x,
+            i,
+            v,
+            use_locking=self._use_locking))

+ 510 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/main.py

@@ -0,0 +1,510 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import math
+import time
+
+from absl import flags
+import absl.logging as _logging  # pylint: disable=unused-import
+
+import tensorflow as tf
+import horovod.tensorflow as hvd
+import model
+import data_utils
+import lamb
+import dllogger
+from exp_utils import AverageMeter, setup_dllogger
+
+import numpy as np
+
+flags.DEFINE_integer("num_core_per_host", default=8,
+      help="Number of cores per host")
+flags.DEFINE_bool('horovod', True, 'Use Horovod')
+# Experiment (data/checkpoint/directory) config
+flags.DEFINE_string("raport_file", default="summary.json",
+      help="Path to dlloger json")
+flags.DEFINE_string("data_dir", default="",
+      help="Path to tf-records directory.")
+flags.DEFINE_string("record_info_dir", default="",
+      help="Path to local directory containing filenames.txt.")
+flags.DEFINE_string("corpus_info_path", default="",
+      help="Path to corpus-info.json file.")
+flags.DEFINE_string("model_dir", default="LM-TFM",
+      help="Estimator model_dir.")
+flags.DEFINE_bool("do_train", default=True,
+      help="Whether to run training.")
+flags.DEFINE_bool("do_eval", default=False,
+      help="Whether to run eval on the dev set.")
+flags.DEFINE_string("eval_ckpt_path", None,
+      help="Checkpoint path for do_test evaluation."
+           "If set, model_dir will be ignored."
+           "If unset, will use the latest ckpt in model_dir.")
+flags.DEFINE_bool("fp16", default=False,
+      help="Whether to enable AMP ops.")
+flags.DEFINE_bool("jit_optimizer", default=True,
+      help="Whether to enable XLA on optimizer")
+
+# Optimization config
+flags.DEFINE_float("learning_rate", default=0.01,
+      help="Maximum learning rate.")
+flags.DEFINE_float("clip", default=0.25,
+      help="Gradient clipping value.")
+# for cosine decay
+flags.DEFINE_float("min_lr_ratio", default=0.1,
+      help="Minimum ratio learning rate.")
+flags.DEFINE_integer("warmup_steps", default=1000,
+      help="Number of steps for linear lr warmup.")
+
+# Training config
+flags.DEFINE_integer("train_batch_size", default=256,
+      help="Size of train batch.")
+flags.DEFINE_integer("eval_batch_size", default=16,
+      help="Size of valid batch.")
+flags.DEFINE_integer("train_steps", default=40000,
+      help="Total number of training steps.")
+flags.DEFINE_integer("log_interval", default=100,
+      help="Number of iterations per repeat loop.")
+flags.DEFINE_integer("save_steps", default=5000,
+      help="number of steps for model checkpointing.")
+flags.DEFINE_integer("batch_chunk", default=1,
+      help="Number of accumulation steps.")
+
+# Evaluation config
+flags.DEFINE_integer("max_eval_batch", default=-1,
+      help="Set -1 to turn off. Only used in test mode.")
+flags.DEFINE_string("eval_split", "valid",
+      help="Which data split to evaluate.")
+flags.DEFINE_list("percentiles", default=['90', '95', '99'],
+      help="percentiles for latency confidence intervals")
+
+# Model config
+flags.DEFINE_integer("tgt_len", default=192,
+      help="Number of steps to predict")
+flags.DEFINE_integer("mem_len", default=192,
+      help="Number of steps to cache")
+flags.DEFINE_bool("same_length", default=False,
+      help="Same length attention")
+flags.DEFINE_integer("clamp_len", default=-1,
+      help="Clamp length")
+
+flags.DEFINE_integer("n_layer", default=16,
+      help="Number of layers.")
+flags.DEFINE_integer("d_model", default=512,
+      help="Dimension of the model.")
+flags.DEFINE_integer("d_embed", default=512,
+      help="Dimension of the embeddings.")
+flags.DEFINE_integer("n_head", default=8,
+      help="Number of attention heads.")
+flags.DEFINE_integer("d_head", default=64,
+      help="Dimension of each attention head.")
+flags.DEFINE_integer("d_inner", default=2048,
+      help="Dimension of inner hidden size in positionwise feed-forward.")
+flags.DEFINE_float("dropout", default=0.1,
+      help="Dropout rate.")
+flags.DEFINE_float("dropatt", default=0.0,
+      help="Attention dropout rate.")
+flags.DEFINE_bool("untie_r", default=False,
+      help="untie r_w_bias and r_r_bias")
+
+# Adaptive Softmax / Embedding
+flags.DEFINE_bool("tie_weight", default=True,
+      help="Tie embedding and softmax weight.")
+flags.DEFINE_integer("div_val", default=1,
+      help="Divide the embedding size by this val for each bin")
+flags.DEFINE_bool("proj_share_all_but_first", default=False,
+      help="True to share all but first projs, False not to share.")
+flags.DEFINE_bool("proj_same_dim", default=True,
+      help="Project the bin with the same dimension.")
+
+# Parameter initialization
+flags.DEFINE_enum("init", default="normal",
+      enum_values=["normal", "uniform"],
+      help="Initialization method.")
+flags.DEFINE_float("init_std", default=0.02,
+      help="Initialization std when init is normal.")
+flags.DEFINE_float("proj_init_std", default=0.01,
+      help="Initialization std for embedding projection.")
+flags.DEFINE_float("init_range", default=0.1,
+      help="Initialization std when init is uniform.")
+
+
+FLAGS = flags.FLAGS
+
+def get_model_fn(n_token, cutoffs):
+  def model_fn(inp, tgt, mems, is_training):
+    inp = tf.transpose(inp, [1, 0])
+    tgt = tf.transpose(tgt, [1, 0])
+
+    if FLAGS.init == "uniform":
+      initializer = tf.initializers.random_uniform(
+          minval=-FLAGS.init_range,
+          maxval=FLAGS.init_range,
+          seed=None)
+    elif FLAGS.init == "normal":
+      initializer = tf.initializers.random_normal(
+          stddev=FLAGS.init_std,
+          seed=None)
+      proj_initializer = tf.initializers.random_normal(
+          stddev=FLAGS.proj_init_std,
+          seed=None)
+
+    tie_projs = [False for _ in range(len(cutoffs) + 1)]
+    if FLAGS.proj_share_all_but_first:
+      for i in range(1, len(tie_projs)):
+        tie_projs[i] = True
+
+    loss, new_mems = model.transformer(
+        dec_inp=inp,
+        target=tgt,
+        mems=mems,
+        n_token=n_token,
+        n_layer=FLAGS.n_layer,
+        d_model=FLAGS.d_model,
+        d_embed=FLAGS.d_embed,
+        n_head=FLAGS.n_head,
+        d_head=FLAGS.d_head,
+        d_inner=FLAGS.d_inner,
+        dropout=FLAGS.dropout,
+        dropatt=FLAGS.dropatt,
+        initializer=initializer,
+        proj_initializer=proj_initializer,
+        is_training=is_training,
+        mem_len=FLAGS.mem_len,
+        cutoffs=cutoffs,
+        div_val=FLAGS.div_val,
+        tie_projs=tie_projs,
+        input_perms=None,
+        target_perms=None,
+        head_target=None,
+        same_length=FLAGS.same_length,
+        clamp_len=FLAGS.clamp_len,
+        untie_r=FLAGS.untie_r,
+        proj_same_dim=FLAGS.proj_same_dim)
+
+    # number of parameters
+    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
+    tf.logging.info('#params: {}'.format(num_params))
+
+    if is_training:
+      all_vars = tf.trainable_variables()
+
+      return loss, new_mems, all_vars
+    else:
+      return loss, new_mems
+
+  return model_fn
+
+
+def single_core_graph(n_token, cutoffs, is_training, inp, tgt, mems):
+  model_fn = get_model_fn(
+      n_token=n_token,
+      cutoffs=cutoffs)
+
+  model_ret = model_fn(
+      inp=inp,
+      tgt=tgt,
+      mems=mems,
+      is_training=is_training)
+
+  return model_ret
+
+
+def train(n_token, cutoffs, rank, local_rank, size):
+
+  meters = {}
+  warmup = 2 + 12/size
+  meters['train_throughput'] = AverageMeter(warmup=warmup)
+  train_batch_size = FLAGS.train_batch_size // FLAGS.batch_chunk
+  ##### Get input function and model function
+  train_input_fn, train_record_info = data_utils.get_input_fn(
+      record_info_dir=FLAGS.record_info_dir,
+      split="train",
+      per_host_bsz=train_batch_size,
+      tgt_len=FLAGS.tgt_len,
+      num_core_per_host=FLAGS.num_core_per_host,
+      num_hosts=1)
+
+  tf.logging.info("num of batches {}".format(train_record_info["num_batch"]))
+
+  ##### Create computational graph
+  train_set = train_input_fn({
+      "batch_size": train_batch_size,
+      "data_dir": FLAGS.data_dir})
+
+  inputs, labels = train_set.make_one_shot_iterator().get_next()
+
+  per_core_bsz = train_batch_size // FLAGS.num_core_per_host
+
+  with tf.variable_scope(tf.get_variable_scope()):
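+    # the recurrence memory: one non-trainable [mem_len, per_core_bsz, d_model]
+    # variable per layer, overwritten each step with the new memory returned
+    # by the model (see assign_mems below)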
+    mems = [tf.Variable(tf.zeros([FLAGS.mem_len, per_core_bsz, FLAGS.d_model], tf.float32), trainable=False)
+              for _ in range(FLAGS.n_layer)]
+
+    loss, new_mems, all_vars = single_core_graph(
+        n_token=n_token,
+        cutoffs=cutoffs,
+        is_training=True,
+        inp=inputs,
+        tgt=labels,
+        mems=mems)
+
+    assign_mems = [mems[i].assign(new_mems[i]) for i in range(FLAGS.n_layer)]
+
+  target_tokens = tf.size(labels)
+
+  ## configure the optimizer
+  global_step = tf.train.get_or_create_global_step()
+
+  # warmup stage: increase the learning rate linearly
+  if FLAGS.warmup_steps > 0:
+    warmup_lr = tf.to_float(global_step) / tf.to_float(FLAGS.warmup_steps) \
+                * FLAGS.learning_rate
+  else:
+    warmup_lr = 0.0
+
+  # decay stage: decay the learning rate using the cosine schedule
+  decay_lr = tf.train.cosine_decay(
+      FLAGS.learning_rate,
+      global_step=global_step-FLAGS.warmup_steps,
+      decay_steps=FLAGS.train_steps-FLAGS.warmup_steps,
+      alpha=FLAGS.min_lr_ratio)
+
+  # choose warmup or decay
+  learning_rate = tf.where(global_step < FLAGS.warmup_steps,
+                           warmup_lr, decay_lr)
+
+  # get the train op
+  optimizer = lamb.LAMBOptimizer(learning_rate=learning_rate)
+  if FLAGS.horovod:
+    optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True)
+  grads_and_vars = optimizer.compute_gradients(loss/FLAGS.batch_chunk, all_vars)
+  grads, all_vars = zip(*grads_and_vars)
+
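+  # Gradient accumulation over FLAGS.batch_chunk chunks: the first
+  # (batch_chunk - 1) chunks run `acc_op`, which stores or accumulates their
+  # gradients in accum_vars (the `in_progress` flag distinguishes the first
+  # chunk, assign, from later ones, assign_add); the final chunk adds its own
+  # gradient on top, clips, applies the update and resets the flag.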
+  accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in all_vars]
+  in_progress = tf.get_variable(name="in_progress", shape=[], dtype=tf.bool, trainable=False,
+                               initializer=tf.zeros_initializer)
+  accum_ops = tf.cond(in_progress,
+                      lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(grads)],
+                      lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(grads)])
+  with tf.control_dependencies(accum_ops + assign_mems):
+    acc_op = in_progress.assign(tf.ones_like(in_progress))
+  final_accum_vars = [accum_vars[i] + gv for i,gv in enumerate(grads)]
+  acc_clipped, acc_gnorm = tf.clip_by_global_norm(final_accum_vars, FLAGS.clip)
+  clipped, gnorm = tf.clip_by_global_norm(grads, FLAGS.clip)
+  acc_train_op = optimizer.apply_gradients(list(zip(acc_clipped, all_vars)), global_step)
+  grads_and_vars = list(zip(clipped, all_vars))
+  if FLAGS.jit_optimizer:
+    jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
+    with jit_scope():
+      train_op = optimizer.apply_gradients(grads_and_vars, global_step)
+  else:
+    train_op = optimizer.apply_gradients(grads_and_vars, global_step)
+  final_op = tf.group(train_op, assign_mems)
+  acc_final_op = tf.group(acc_train_op, assign_mems, in_progress.assign(tf.zeros_like(in_progress)))
+  ##### Training loop
+  saver = tf.train.Saver()
+
+  gpu_options = tf.GPUOptions(allow_growth = True, visible_device_list = str(local_rank))
+  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options = gpu_options)) as sess:
+    sess.run(tf.global_variables_initializer())
+    if FLAGS.horovod:
+      sess.run(hvd.broadcast_global_variables(0))
+
+    accum = [acc_op, target_tokens]
+    fetches = [loss, global_step, target_tokens, learning_rate, final_op if FLAGS.batch_chunk == 1 else acc_final_op]
+    total_loss, prev_step, target_tokens = 0., -1, 0
+    start_time = time.time()
+    while True:
+      for i in range(FLAGS.batch_chunk-1):
+        _,tt = sess.run(accum)
+        target_tokens += tt
+      fetched = sess.run(fetches)
+
+      loss_np, curr_step, tt = fetched[:3]
+      total_loss += loss_np
+      target_tokens += tt
+
+      if curr_step > 0 and curr_step % FLAGS.log_interval == 0:
+        curr_loss = total_loss / (curr_step - prev_step)
+        throughput = target_tokens * size / (time.time()-start_time)
+        meters['train_throughput'].update(throughput)
+        if rank == 0:
+          tf.logging.info("step {} | lr {:8.9f} "
+                        "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}, tok/s {:>6.0f}".format(
+                            curr_step, fetched[-2],
+                            curr_loss, math.exp(curr_loss), curr_loss / math.log(2), throughput))
+          dllogger_data = {
+              'lr': fetched[-2],
+              'train_loss': curr_loss,
+              'train_perplexity': math.exp(curr_loss),
+              'train_throughput': throughput,
+          }
+          dllogger.log(step=int(curr_step), data=dllogger_data)
+        total_loss, prev_step, target_tokens = 0., curr_step, 0
+        start_time = time.time()
+
+      if curr_step > 0 and curr_step % FLAGS.save_steps == 0 and rank == 0:
+        save_path = os.path.join(FLAGS.model_dir, "model.ckpt")
+        saver.save(sess, save_path)
+        tf.logging.info("Model saved in path: {}".format(save_path))
+
+      if curr_step == FLAGS.train_steps:
+        break
+    if rank == 0:
+      tf.logging.info("Training throughput: {:>6.0f} tok/s".format(meters['train_throughput'].avg))
+      summary = {
+          'train_throughput': meters['train_throughput'].avg,
+      }
+      dllogger.log(step=tuple(), data=summary)
+
+
+
+def evaluate(n_token, cutoffs):
+  ##### Get input function and model function
+  eval_input_fn, eval_record_info = data_utils.get_input_fn(
+      record_info_dir=FLAGS.record_info_dir,
+      split=FLAGS.eval_split,
+      per_host_bsz=FLAGS.eval_batch_size,
+      tgt_len=FLAGS.tgt_len,
+      num_core_per_host=FLAGS.num_core_per_host,
+      num_hosts=1)
+
+  meters = {}
+  warmup = 2
+  meters['eval_throughput'] = AverageMeter(warmup=warmup)
+  meters['eval_latency'] = AverageMeter(warmup=warmup, keep=True)
+
+  num_batch = eval_record_info["num_batch"]
+  if FLAGS.max_eval_batch > 0:
+      num_batch = FLAGS.max_eval_batch
+  tf.logging.info("num of batches {}".format(num_batch))
+
+  ##### Create computational graph
+  eval_set = eval_input_fn({
+      "batch_size": FLAGS.eval_batch_size,
+      "data_dir": FLAGS.data_dir})
+
+  inputs, labels = eval_set.make_one_shot_iterator().get_next()
+
+  bsz = FLAGS.eval_batch_size
+
+  with tf.variable_scope(tf.get_variable_scope()):
+    mems = [tf.placeholder(tf.float32,
+                             [FLAGS.mem_len, bsz, FLAGS.d_model])
+              for _ in range(FLAGS.n_layer)]
+
+    loss, new_mems = single_core_graph(
+        n_token=n_token,
+        cutoffs=cutoffs,
+        is_training=False,
+        inp=inputs,
+        tgt=labels,
+        mems=mems)
+
+  target_tokens = tf.size(labels)
+  ##### Evaluation loop
+  mems_np = [np.zeros([FLAGS.mem_len, bsz, FLAGS.d_model], dtype=np.float32)
+          for layer in range(FLAGS.n_layer)]
+
+  saver = tf.train.Saver()
+
+  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
+    sess.run(tf.global_variables_initializer())
+
+    if FLAGS.eval_ckpt_path is None:
+      eval_ckpt_path = tf.train.latest_checkpoint(FLAGS.model_dir)
+    else:
+      eval_ckpt_path = FLAGS.eval_ckpt_path
+    tf.logging.info("Evaluate {}".format(eval_ckpt_path))
+    saver.restore(sess, eval_ckpt_path)
+
+    fetches = [loss, new_mems, target_tokens]
+
+    format_str = "  >> processing batch {{:{0}d}}/{{:{0}d}}".format(
+        len(str(num_batch)))
+
+    total_loss, total_cnt, target_tokens = 0, 0, 0
+    start_time = time.time()
+    for step in range(num_batch):
+      feed_dict = {}
+      for m, m_np in zip(mems, mems_np):
+        feed_dict[m] = m_np
+
+      fetched = sess.run(fetches, feed_dict=feed_dict)
+
+      loss_np, mems_np, tt = fetched
+      target_tokens += tt
+      cnt_np = 1
+      total_loss += loss_np * cnt_np
+      total_cnt += cnt_np
+
+      elapsed = time.time()-start_time
+      throughput = target_tokens / elapsed
+      latency = elapsed*1000
+      meters['eval_throughput'].update(throughput)
+      meters['eval_latency'].update(latency)
+      target_tokens = 0
+      if (step+1) % (num_batch // 10) == 0:
+        tf.logging.info(format_str.format(step+1, num_batch))
+        dllogger_data = {
+            'eval_latency': latency,
+            'eval_throughput': throughput,
+        }
+        dllogger.log(step=step+1, data=dllogger_data)
+
+
+      start_time = time.time()
+    avg_loss = total_loss / total_cnt
+    latency_data = np.array(meters['eval_latency'].vals)
+    tf.logging.info("Evaluating with: bs {}, math {} ".format(FLAGS.eval_batch_size, "fp16" if FLAGS.fp16 else "fp32"))
+    tf.logging.info("| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}, tok/s {:>6.1f}, ms/batch {:>4.2f}".format(
+        avg_loss, math.exp(avg_loss), avg_loss / math.log(2), meters['eval_throughput'].avg, meters['eval_latency'].avg))
+    summary = {
+        'eval_loss': avg_loss,
+        'eval_ppl': math.exp(avg_loss),
+        'eval_avg_throughput': meters['eval_throughput'].avg,
+        'eval_avg_latency': meters['eval_latency'].avg,
+    }
+    for p in FLAGS.percentiles:
+      p = int(p)
+      tf.logging.info("Latency {}%: {:>4.2f} ms".format(
+        p, np.percentile(latency_data, p)))
+      summary[f'eval_{p}%_latency'] = np.percentile(latency_data, p)
+    dllogger.log(step=tuple(), data=summary)
+
+
+
+def main(unused_argv):
+  rank, local_rank, size = 0, 0, 1
+  if FLAGS.horovod:
+    hvd.init()
+    rank = hvd.rank()
+    local_rank = hvd.local_rank()
+    size = hvd.size()
+  del unused_argv  # Unused
+
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  if FLAGS.fp16:
+      os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
+  else:
+      os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "0"
+
+  # Get corpus info
+  corpus_info = data_utils.get_corpus_info(FLAGS.corpus_info_path)
+  n_token = corpus_info["vocab_size"]
+  cutoffs = corpus_info["cutoffs"][1:-1]
+  tf.logging.info("n_token {}".format(n_token))
+
+  setup_dllogger(enabled=True, filename=FLAGS.raport_file, rank=rank)
+
+  if FLAGS.do_train:
+    train(n_token, cutoffs, rank, local_rank, size)
+  if FLAGS.do_eval:
+    evaluate(n_token, cutoffs)
+
+
+
+if __name__ == "__main__":
+  tf.app.run()

+ 539 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/model.py

@@ -0,0 +1,539 @@
+import tensorflow as tf
+
+
+def positional_embedding(pos_seq, inv_freq, bsz=None):
+  sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)
+  pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
+  if bsz is not None:
+    return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
+  else:
+    return pos_emb[:, None, :]
+
+
+def positionwise_FF(inp, d_model, d_inner, dropout, kernel_initializer,
+                    scope='ff', is_training=True):
+  output = inp
+  with tf.variable_scope(scope):
+    output = tf.layers.dense(inp, d_inner, activation=tf.nn.relu,
+                             kernel_initializer=kernel_initializer,
+                             name='layer_1')
+    output = tf.layers.dropout(output, dropout, training=is_training,
+                               name='drop_1')
+    output = tf.layers.dense(output, d_model,
+                             kernel_initializer=kernel_initializer,
+                             name='layer_2')
+    output = tf.layers.dropout(output, dropout, training=is_training,
+                               name='drop_2')
+    output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis=-1)
+  return output
+
+
+def rel_shift(x):
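+  # standard Transformer-XL relative shift: pad one zero column, reshape so
+  # the pad moves to the leading slot of the second-to-last axis, slice it
+  # off and reshape back; this realigns each query's scores with the correct
+  # relative positions without an explicit gather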
+  x_size = tf.shape(x)
+
+  x = tf.pad(x, [[0, 0], [0, 0], [0, 0], [1, 0]])
+  x = tf.reshape(x, [x_size[0], x_size[1], x_size[3] + 1, x_size[2]])
+  x = tf.slice(x, [0, 0, 1, 0], [-1, -1, -1, -1])
+  x = tf.reshape(x, x_size)
+
+  return x
+
+
+def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,
+                       n_head, d_head, dropout, dropatt, is_training,
+                       kernel_initializer, scope='rel_attn'):
+  scale = 1 / (d_head ** 0.5)
+  with tf.variable_scope(scope):
+    qlen = tf.shape(w)[0]
+    rlen = tf.shape(r)[0]
+    bsz = tf.shape(w)[1]
+
+    cat = tf.concat([mems, w],
+                    0) if mems is not None and mems.shape.ndims > 1 else w
+    w_heads = tf.layers.dense(cat, 3 * n_head * d_head, use_bias=False,
+                              kernel_initializer=kernel_initializer, name='qkv')
+    r_head_k = tf.layers.dense(r, n_head * d_head, use_bias=False,
+                               kernel_initializer=kernel_initializer, name='r')
+
+    w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, -1)
+    w_head_q = w_head_q[-qlen:]
+
+    klen = tf.shape(w_head_k)[0]
+
+    w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])
+    w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])
+    w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])
+
+    r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])
+
+    rw_head_q = w_head_q + r_w_bias
+    rr_head_q = w_head_q + r_r_bias
+
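+    # Transformer-XL attention decomposition: AC is the content-based term
+    # (query + r_w_bias) . key, BD is the position-based term
+    # (query + r_r_bias) . relative positional embedding, realigned with
+    # rel_shift below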
+    AC = tf.einsum('ibnd,jbnd->bnij', rw_head_q, w_head_k)
+    BD = tf.einsum('ibnd,jnd->bnij', rr_head_q, r_head_k)
+    BD = rel_shift(BD)
+
+    attn_score = (AC + BD) * scale
+    attn_mask_t = attn_mask[None, None, :, :]
+    attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t
+
+    attn_prob = tf.nn.softmax(attn_score, 3)
+    attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training)
+
+    attn_vec = tf.einsum('bnij,jbnd->ibnd', attn_prob, w_head_v)
+    size_t = tf.shape(attn_vec)
+    attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])
+
+    attn_out = tf.layers.dense(attn_vec, d_model, use_bias=False,
+                               kernel_initializer=kernel_initializer, name='o')
+    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
+
+    output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis=-1)
+  return output
+
+
+def embedding_lookup(lookup_table, x, use_tpu=True):
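+  # with use_tpu=True the lookup is expressed as a one-hot matmul (einsum),
+  # which avoids the gather op; otherwise the native tf.nn.embedding_lookup
+  # is used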
+  if use_tpu:
+    n_token = tf.shape(lookup_table)[0]
+    one_hot_idx = tf.one_hot(x, n_token)
+    if one_hot_idx.shape.ndims == 2:
+      return tf.einsum('nd,in->id', lookup_table, one_hot_idx)
+    else:
+      return tf.einsum('nd,ibn->ibd', lookup_table, one_hot_idx)
+  else:
+    return tf.nn.embedding_lookup(lookup_table, x)
+
+
+def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
+                                   proj_initializer, div_val=1,
+                                   proj_same_dim=True,
+                                   scope='adaptive_embed', **kwargs):
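+  # adaptive embeddings: with div_val > 1, tokens in the i-th vocabulary bin
+  # (defined by `cutoffs`) use embeddings of size d_embed // div_val**i that
+  # are projected back to d_proj before being scattered into the output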
+  emb_scale = d_proj ** 0.5
+  with tf.variable_scope(scope):
+    if div_val == 1:
+      lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
+                                     initializer=initializer)
+      y = embedding_lookup(lookup_table, x, use_tpu=False)
+      if d_proj != d_embed:
+        proj_W = tf.get_variable('proj_W', [d_embed, d_proj],
+                                 initializer=proj_initializer)
+        y = tf.einsum('ibe,ed->ibd', y, proj_W)
+      else:
+        proj_W = None
+      ret_params = [lookup_table, proj_W]
+    else:
+      tables, projs = [], []
+      cutoff_ends = [0] + cutoffs + [n_token]
+      x_size = tf.shape(x)
+      y = tf.zeros([x_size[0], x_size[1], d_proj])
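+      # Adaptive input: cutoffs split the vocabulary into frequency bins; bin i
+      # uses a smaller embedding size d_embed // div_val**i, is projected to
+      # d_proj, and is scattered back into the full [qlen, bsz, d_proj] output.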
+      for i in range(len(cutoff_ends) - 1):
+        with tf.variable_scope('cutoff_{}'.format(i)):
+          l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
+          mask = (x >= l_idx) & (x < r_idx)
+          cur_x = tf.boolean_mask(x, mask) - l_idx
+          cur_d_embed = d_embed // (div_val ** i)
+          lookup_table = tf.get_variable('lookup_table',
+                                         [r_idx - l_idx, cur_d_embed],
+                                         initializer=initializer)
+          cur_y = embedding_lookup(lookup_table, cur_x, use_tpu=False)
+          if d_proj == cur_d_embed and not proj_same_dim:
+            proj_W = None
+          else:
+            proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],
+                                     initializer=proj_initializer)
+            cur_y = tf.einsum('id,de->ie', cur_y, proj_W)
+          mask_idx = tf.to_int64(tf.where(mask))
+          y += tf.scatter_nd(mask_idx, cur_y, tf.to_int64(tf.shape(y)))
+          tables.append(lookup_table)
+          projs.append(proj_W)
+      ret_params = [tables, projs]
+
+  y *= emb_scale
+  return y, ret_params
+
+
+def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
+                                  proj_initializer, div_val=1, perms=None,
+                                  proj_same_dim=True,
+                                  scope='adaptive_embed'):
+  """
+  perms: If None, first compute W = W1 x W2 (projection for each bin),
+      and then compute X x W (embedding lookup). If not None,
+      use bin-based embedding lookup with max_bin_size defined by
+      the shape of perms.
+  """
+  emb_scale = d_proj ** 0.5
+  with tf.variable_scope(scope):
+    if div_val == 1:
+      lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
+                                     initializer=initializer)
+      y = embedding_lookup(lookup_table, x)
+      if d_proj != d_embed:
+        proj_W = tf.get_variable('proj_W', [d_embed, d_proj],
+                                 initializer=proj_initializer)
+        y = tf.einsum('ibe,ed->ibd', y, proj_W)
+      else:
+        proj_W = None
+      ret_params = [lookup_table, proj_W]
+    else:
+      tables, projs = [], []
+      cutoff_ends = [0] + cutoffs + [n_token]
+      x_size = tf.shape(x)
+      if perms is None:
+        cat_lookup = []
+      else:
+        cat_lookup = tf.zeros([x_size[0], x_size[1], d_proj])
+      for i in range(len(cutoff_ends) - 1):
+        with tf.variable_scope('cutoff_{}'.format(i)):
+          l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
+          cur_d_embed = d_embed // (div_val ** i)
+          lookup_table = tf.get_variable('lookup_table',
+                                         [r_idx - l_idx, cur_d_embed],
+                                         initializer=initializer)
+          if cur_d_embed == d_proj and not proj_same_dim:
+            proj_W = None
+          else:
+            proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],
+                                     initializer=proj_initializer)
+          if perms is None:
+            cat_lookup.append(tf.einsum('ie,ed->id', lookup_table, proj_W))
+          else:
+            # speed up the computation of the first bin
+            # also save some memory
+            if i == 0:
+              cur_y = embedding_lookup(lookup_table, tf.minimum(x, r_idx - 1))
+              if proj_W is not None:
+                cur_y = tf.einsum('ibe,ed->ibd', cur_y, proj_W)
+              cur_y *= perms[i][:, :, None]
+              cat_lookup += cur_y
+            else:
+              cur_x = tf.einsum('ib,ibk->k', tf.to_float(x - l_idx), perms[i])
+              cur_x = tf.to_int32(cur_x)
+              cur_y = embedding_lookup(lookup_table, cur_x)
+              if proj_W is not None:
+                cur_y = tf.einsum('ke,ed->kd', cur_y, proj_W)
+              cat_lookup += tf.einsum('kd,ibk->ibd', cur_y, perms[i])
+          tables.append(lookup_table)
+          projs.append(proj_W)
+      if perms is None:
+        cat_lookup = tf.concat(cat_lookup, 0)
+        y = embedding_lookup(cat_lookup, x)
+      else:
+        y = cat_lookup
+      ret_params = [tables, projs]
+
+  y *= emb_scale
+  return y, ret_params
+
+
+def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
+                             params, tie_projs,
+                             initializer=None, proj_initializer=None,
+                             div_val=1, scope='adaptive_softmax',
+                             proj_same_dim=True,
+                             return_mean=True, **kwargs):
+  def _logit(x, W, b, proj):
+    y = x
+    if proj is not None:
+      y = tf.einsum('ibd,ed->ibe', y, proj)
+    return tf.einsum('ibd,nd->ibn', y, W) + b
+
+  params_W, params_projs = params[0], params[1]
+
+  def _gather_logprob(logprob, target):
+    lp_size = tf.shape(logprob)
+    r = tf.range(lp_size[0])
+    idx = tf.stack([r, target], 1)
+    return tf.gather_nd(logprob, idx)
+
+  with tf.variable_scope(scope):
+    if len(cutoffs) == 0:
+      softmax_b = tf.get_variable('bias', [n_token],
+                                  initializer=tf.zeros_initializer())
+      output = _logit(hidden, params_W, softmax_b, params_projs)
+      nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
+                                                           logits=output)
+    else:
+      cutoff_ends = [0] + cutoffs + [n_token]
+      nll = tf.zeros_like(target, dtype=tf.float32)
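+      # Adaptive softmax: the head covers the first bin plus one cluster logit
+      # per tail bin; a tail token's log-probability is the head log-prob of
+      # its cluster plus its log-prob within that tail bin.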
+      for i in range(len(cutoff_ends) - 1):
+        with tf.variable_scope('cutoff_{}'.format(i)):
+          l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
+          mask = (target >= l_idx) & (target < r_idx)
+          mask_idx = tf.where(mask)
+          cur_target = tf.boolean_mask(target, mask) - l_idx
+          cur_d_embed = d_embed // (div_val ** i)
+
+          if div_val == 1:
+            cur_W = params_W[l_idx: r_idx]
+          else:
+            cur_W = params_W[i]
+          cur_b = tf.get_variable('b', [r_idx - l_idx],
+                                  initializer=tf.zeros_initializer())
+          if tie_projs[i]:
+            if div_val == 1:
+              cur_proj = params_projs
+            else:
+              cur_proj = params_projs[i]
+          else:
+            if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:
+              cur_proj = None
+            else:
+              cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],
+                                         initializer=proj_initializer)
+          if i == 0:
+            cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],
+                                        initializer=tf.zeros_initializer())
+            cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],
+                                        initializer=tf.zeros_initializer())
+            cur_W = tf.concat([cur_W, cluster_W], 0)
+            cur_b = tf.concat([cur_b, cluster_b], 0)
+
+            head_logit = _logit(hidden, cur_W, cur_b, cur_proj)
+            head_logprob = tf.nn.log_softmax(head_logit)
+            cur_head_logprob = tf.boolean_mask(head_logprob, mask)
+            cur_logprob = _gather_logprob(cur_head_logprob, cur_target)
+          else:
+            cur_head_logprob = tf.boolean_mask(head_logprob, mask)
+            cur_hidden = tf.boolean_mask(hidden, mask)
+            tail_logit = tf.squeeze(_logit(
+                cur_hidden[None], cur_W, cur_b, cur_proj), 0)
+            tail_logprob = tf.nn.log_softmax(tail_logit)
+            cur_logprob = (cur_head_logprob[:, cutoff_ends[1] + i - 1] +
+                           _gather_logprob(tail_logprob, cur_target))
+          nll += tf.scatter_nd(mask_idx, -cur_logprob,
+                                 tf.to_int64(tf.shape(nll)))
+  if return_mean:
+    nll = tf.reduce_mean(nll)
+  return nll
+
+
+def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
+                            params, tie_projs,
+                            initializer=None, proj_initializer=None,
+                            div_val=1, perms=None, proj_same_dim=True,
+                            scope='adaptive_softmax',
+                            **kwargs):
+  def _logit(x, W, b, proj):
+    y = x
+    if x.shape.ndims == 3:
+      if proj is not None:
+        y = tf.einsum('ibd,ed->ibe', y, proj)
+      return tf.einsum('ibd,nd->ibn', y, W) + b
+    else:
+      if proj is not None:
+        y = tf.einsum('id,ed->ie', y, proj)
+      return tf.einsum('id,nd->in', y, W) + b
+
+  params_W, params_projs = params[0], params[1]
+
+  with tf.variable_scope(scope):
+    if len(cutoffs) == 0:
+      softmax_b = tf.get_variable('bias', [n_token],
+                                  initializer=tf.zeros_initializer())
+      output = _logit(hidden, params_W, softmax_b, params_projs)
+      nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
+                                                           logits=output)
+      nll = tf.reduce_mean(nll)
+    else:
+      total_loss, total_cnt = 0, 0
+      cutoff_ends = [0] + cutoffs + [n_token]
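+      # Bin-based (use_tpu) variant: the one-hot perms[i] tensors gather each
+      # bin's tokens into fixed-size slots so all shapes stay static; the loss
+      # is averaged over the number of real tokens.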
+      for i in range(len(cutoff_ends) - 1):
+        with tf.variable_scope('cutoff_{}'.format(i)):
+          l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
+
+          cur_d_embed = d_embed // (div_val ** i)
+
+          if div_val == 1:
+            cur_W = params_W[l_idx: r_idx]
+          else:
+            cur_W = params_W[i]
+          cur_b = tf.get_variable('b', [r_idx - l_idx],
+                                  initializer=tf.zeros_initializer())
+          if tie_projs[i]:
+            if div_val == 1:
+              cur_proj = params_projs
+            else:
+              cur_proj = params_projs[i]
+          else:
+            if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:
+              cur_proj = None
+            else:
+              cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],
+                                         initializer=proj_initializer)
+
+          if i == 0:
+            cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],
+                                        initializer=tf.zeros_initializer())
+            cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],
+                                        initializer=tf.zeros_initializer())
+            cur_W = tf.concat([cur_W, cluster_W], 0)
+            cur_b = tf.concat([cur_b, cluster_b], 0)
+
+            head_logit = _logit(hidden, cur_W, cur_b, cur_proj)
+
+            head_target = kwargs.get("head_target")
+            head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
+                labels=head_target,
+                logits=head_logit)
+
+            masked_loss = head_nll * perms[i]
+            total_loss += tf.reduce_sum(masked_loss)
+            total_cnt += tf.reduce_sum(perms[i])
+          else:
+            cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])
+
+            cur_hidden = tf.einsum('ibd,ibk->kd', hidden, perms[i])
+            tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)
+
+            tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx),
+                                    perms[i])
+            tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
+                labels=tf.to_int32(tail_target),
+                logits=tail_logit)
+
+            sum_nll = cur_head_nll + tail_nll
+            mask = tf.reduce_sum(perms[i], [0, 1])
+
+            masked_loss = sum_nll * mask
+            total_loss += tf.reduce_sum(masked_loss)
+            total_cnt += tf.reduce_sum(mask)
+
+      nll = total_loss / total_cnt
+
+  return nll
+
+
+def _create_mask(qlen, mlen, same_length=False):
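+  # Returns a [qlen, qlen + mlen] mask where 1 marks positions that must not
+  # be attended to: cached memory is always visible, future tokens in the
+  # current segment are masked, and same_length=True additionally gives every
+  # query the same attention span.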
+  attn_mask = tf.ones([qlen, qlen])
+  mask_u = tf.matrix_band_part(attn_mask, 0, -1)
+  mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
+  attn_mask_pad = tf.zeros([qlen, mlen])
+  ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
+  if same_length:
+    mask_l = tf.matrix_band_part(attn_mask, -1, 0)
+    ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
+  return ret
+
+def _cache_mem(curr_out, prev_mem, mem_len=None):
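+  # Keep the last `mem_len` hidden states (previous memory + current output)
+  # and detach them with stop_gradient so no gradient flows into the cache.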
+  if mem_len is None or prev_mem is None:
+    new_mem = curr_out
+  elif mem_len == 0:
+    return prev_mem
+  else:
+    new_mem = tf.concat([prev_mem, curr_out], 0)[- mem_len:]
+
+  return tf.stop_gradient(new_mem)
+
+
+def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,
+                n_head, d_head, d_inner, dropout, dropatt,
+                initializer, is_training, proj_initializer=None,
+                mem_len=None, cutoffs=[], div_val=1, tie_projs=[],
+                same_length=False, clamp_len=-1, use_tpu=False,
+                input_perms=None, target_perms=None, head_target=None,
+                untie_r=False, proj_same_dim=True,
+                scope='transformer'):
+  """
+  cutoffs: a list of python int. Cutoffs for adaptive softmax.
+  tie_projs: a list of python bools. Whether to tie the projections.
+  use_tpu: if True, use one_hot in embedding lookup and bin-based implementation
+        of adaptive softmax.
+  input_perms, target_perms: lists of tensors. Each tensor should be of size
+        [len, bsz, bin_size]. Only used in the adaptive setting.
+  """
+  new_mems = []
+  with tf.variable_scope(scope):
+    if untie_r:
+      r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
+                                 initializer=initializer)
+      r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
+                                 initializer=initializer)
+    else:
+      r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
+                                 initializer=initializer)
+      r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
+                                 initializer=initializer)
+
+    qlen = tf.shape(dec_inp)[0]
+    mlen = tf.shape(mems[0])[0] if mems is not None else 0
+    klen = mlen + qlen
+
+    if proj_initializer is None:
+      proj_initializer = initializer
+    lookup_fn = (mul_adaptive_embedding_lookup if use_tpu else
+                 mask_adaptive_embedding_lookup)
+    embeddings, shared_params = lookup_fn(
+        x=dec_inp,
+        n_token=n_token,
+        d_embed=d_embed,
+        d_proj=d_model,
+        cutoffs=cutoffs,
+        initializer=initializer,
+        proj_initializer=proj_initializer,
+        div_val=div_val,
+        perms=input_perms,
+        proj_same_dim=proj_same_dim)
+
+    attn_mask = _create_mask(qlen, mlen, same_length)
+
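+    # Relative distances run from klen - 1 down to 0 with standard sinusoidal
+    # frequencies; clamp_len caps the maximum relative distance.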
+    pos_seq = tf.range(klen - 1, -1, -1.0)
+    if clamp_len > 0:
+      pos_seq = tf.minimum(pos_seq, clamp_len)
+    inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))
+    pos_emb = positional_embedding(pos_seq, inv_freq)
+
+    output = tf.layers.dropout(embeddings, dropout, training=is_training)
+    pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)
+
+    if mems is None:
+      mems = [None] * n_layer
+
+    for i in range(n_layer):
+      # cache new mems
+      new_mems.append(_cache_mem(output, mems[i], mem_len))
+
+      with tf.variable_scope('layer_{}'.format(i)):
+        output = rel_multihead_attn(
+            w=output,
+            r=pos_emb,
+            r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
+            r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
+            attn_mask=attn_mask,
+            mems=mems[i],
+            d_model=d_model,
+            n_head=n_head,
+            d_head=d_head,
+            dropout=dropout,
+            dropatt=dropatt,
+            is_training=is_training,
+            kernel_initializer=initializer)
+        output = positionwise_FF(
+            inp=output,
+            d_model=d_model,
+            d_inner=d_inner,
+            dropout=dropout,
+            kernel_initializer=initializer,
+            is_training=is_training)
+
+    output = tf.layers.dropout(output, dropout, training=is_training)
+
+    logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else
+                     mask_adaptive_logsoftmax)
+    loss = logsoftmax_fn(
+        hidden=output,
+        target=target,
+        n_token=n_token,
+        d_embed=d_embed,
+        d_proj=d_model,
+        cutoffs=cutoffs,
+        params=shared_params,
+        tie_projs=tie_projs,
+        initializer=initializer,
+        proj_initializer=proj_initializer,
+        div_val=div_val,
+        perms=target_perms,
+        head_target=head_target,
+        proj_same_dim=proj_same_dim)
+    return loss, new_mems
+

+ 98 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/run_wt103_base.sh

@@ -0,0 +1,98 @@
+#!/bin/bash
+
+# Data
+DATA_ROOT=../data/wikitext-103/
+
+# Model
+DIV_VAL=1
+N_LAYER=16
+D_MODEL=512
+D_EMBED=512
+N_HEAD=8
+D_HEAD=64
+D_INNER=2048
+
+# Training
+TGT_LEN=192
+MEM_LEN=192
+
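+# Number of GPUs (second positional argument, defaults to 8); passed to
+# horovodrun as -np for training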
+NUM_CORE=${2:-"8"}
+
+# Testing
+TEST_TGT_LEN=64
+TEST_MEM_LEN=640
+TEST_CLAMP_LEN=400
+
+TEST_NUM_CORE=1
+
+
+if [[ $1 == 'train_data' ]]; then
+    python data_utils.py \
+        --data_dir=${DATA_ROOT}/ \
+        --dataset=wt103 \
+        --tgt_len=${TGT_LEN} \
+        --num_passes=2 \
+        --use_tpu=False \
+        --eval_batch_size=0 \
+        ${@:2}
+elif [[ $1 == 'test_data' ]]; then
+    python data_utils.py \
+        --data_dir=${DATA_ROOT}/ \
+        --dataset=wt103 \
+        --tgt_len=${TEST_TGT_LEN} \
+        --num_passes=1 \
+        --use_tpu=False \
+        ${@:2}
+elif [[ $1 == 'train' ]]; then
+    echo 'Run training...'
+    horovodrun -np ${NUM_CORE} -H localhost:${NUM_CORE} python main.py \
+        --data_dir=${DATA_ROOT}/tfrecords \
+        --record_info_dir=${DATA_ROOT}/tfrecords/ \
+        --corpus_info_path=${DATA_ROOT}/corpus-info.json \
+        --div_val=${DIV_VAL} \
+        --untie_r=True \
+        --proj_share_all_but_first=True \
+        --n_layer=${N_LAYER} \
+        --d_model=${D_MODEL} \
+        --d_embed=${D_EMBED} \
+        --n_head=${N_HEAD} \
+        --d_head=${D_HEAD} \
+        --d_inner=${D_INNER} \
+        --dropout=0.1 \
+        --dropatt=0.0 \
+        --learning_rate=0.01 \
+        --warmup_steps=1000 \
+        --tgt_len=${TGT_LEN} \
+        --mem_len=${MEM_LEN} \
+        --num_core_per_host=${NUM_CORE} \
+        ${@:3}
+elif [[ $1 == 'eval' ]]; then
+    echo 'Run evaluation...'
+    python main.py \
+        --data_dir=${DATA_ROOT}/tfrecords \
+        --record_info_dir=${DATA_ROOT}/tfrecords/ \
+        --corpus_info_path=${DATA_ROOT}/corpus-info.json \
+        --div_val=${DIV_VAL} \
+        --untie_r=True \
+        --proj_share_all_but_first=True \
+        --n_layer=${N_LAYER} \
+        --d_model=${D_MODEL} \
+        --d_embed=${D_EMBED} \
+        --n_head=${N_HEAD} \
+        --d_head=${D_HEAD} \
+        --d_inner=${D_INNER} \
+        --dropout=0.0 \
+        --dropatt=0.0 \
+        --tgt_len=${TEST_TGT_LEN} \
+        --mem_len=${TEST_MEM_LEN} \
+        --clamp_len=${TEST_CLAMP_LEN} \
+        --same_length=True \
+        --num_core_per_host=${TEST_NUM_CORE} \
+        --do_train=False \
+        --do_eval=True \
+        --horovod=False \
+        --eval_split=test \
+        ${@:2}
+else
+    echo 'unknown argument 1'
+fi

+ 17 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/docker/build.sh

@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#       http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+docker build . --network=host --rm -t transformer-xl:latest

+ 17 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/docker/interactive.sh

@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#       http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+nvidia-docker run --init -it --rm --network=host --ipc=host -v $PWD:/workspace/transformer-xl transformer-xl bash

+ 30 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/inference_benchmark.sh

@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#       http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+BATCH_SIZES=(1 2 4 8 16 32)
+# "empty" MATH corresponds to fp32
+MATHS=("" "--fp16")
+
+
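+# Run evaluation for every batch-size/precision pair, pinning the launcher
+# to CPU core 0 with taskset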
+for (( j = 0; j < ${#BATCH_SIZES[@]}; j++ )); do
+   for (( k = 0; k < ${#MATHS[@]}; k++ )); do
+      echo batch size: ${BATCH_SIZES[j]} math: ${MATHS[k]}
+      taskset -c 0 bash run_wt103_base.sh eval \
+         --eval_batch_size "${BATCH_SIZES[j]}" \
+         "${MATHS[k]}" \
+         "${@:1}"
+   done
+done

+ 170 - 0
TensorFlow/LanguageModeling/Transformer-XL/tf/vocabulary.py

@@ -0,0 +1,170 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import Counter, OrderedDict
+
+import numpy as np
+
+import tensorflow as tf
+
+from tensorflow.gfile import Open as open
+from tensorflow.gfile import Exists as exists
+
+class Vocab(object):
+  def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
+         delimiter=None, vocab_file=None):
+    self.counter = Counter()
+    self.special = special
+    self.min_freq = min_freq
+    self.max_size = max_size
+    self.lower_case = lower_case
+    self.delimiter = delimiter
+    self.vocab_file = vocab_file
+
+  def tokenize(self, line, add_eos=False, add_double_eos=False):
+    line = line.strip()
+    # convert to lower case
+    if self.lower_case:
+      line = line.lower()
+
+    # empty delimiter '' will evaluate False
+    if self.delimiter == '':
+      symbols = line
+    else:
+      symbols = line.split(self.delimiter)
+
+    if add_double_eos: # lm1b
+      return ['<S>'] + symbols + ['<S>']
+    elif add_eos:
+      return symbols + ['<eos>']
+    else:
+      return symbols
+
+  def count_file(self, path, verbose=False, add_eos=False):
+    if verbose: print('counting file {} ...'.format(path))
+    assert exists(path)
+
+    sents = []
+    with open(path, 'r') as f:
+      for idx, line in enumerate(f):
+        if verbose and idx > 0 and idx % 500000 == 0:
+          print('  line {}'.format(idx))
+        symbols = self.tokenize(line, add_eos=add_eos)
+        self.counter.update(symbols)
+        sents.append(symbols)
+
+    return sents
+
+  def count_sents(self, sents, verbose=False):
+    """
+      sents : a list of sentences, each a list of tokenized symbols
+    """
+    if verbose: print('counting {} sents ...'.format(len(sents)))
+    for idx, symbols in enumerate(sents):
+      if verbose and idx > 0 and idx % 500000 == 0:
+        print('  line {}'.format(idx))
+      self.counter.update(symbols)
+
+  def _build_from_file(self, vocab_file):
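+    # One symbol per line; only the first whitespace-separated field is used.
+    # The file must contain '<UNK>' for unknown-token lookups.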
+    self.idx2sym = []
+    self.sym2idx = OrderedDict()
+
+    with open(vocab_file, 'r') as f:
+      for line in f:
+        symb = line.strip().split()[0]
+        self.add_symbol(symb)
+    self.unk_idx = self.sym2idx['<UNK>']
+
+  def build_vocab(self):
+    if self.vocab_file:
+      print('building vocab from {}'.format(self.vocab_file))
+      self._build_from_file(self.vocab_file)
+      print('final vocab size {}'.format(len(self)))
+    else:
+      print('building vocab with min_freq={}, max_size={}'.format(
+        self.min_freq, self.max_size))
+      self.idx2sym = []
+      self.sym2idx = OrderedDict()
+
+      for sym in self.special:
+        self.add_special(sym)
+
+      for sym, cnt in self.counter.most_common(self.max_size):
+        if cnt < self.min_freq: break
+        self.add_symbol(sym)
+
+      print('final vocab size {} from {} unique tokens'.format(
+        len(self), len(self.counter)))
+
+  def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
+          add_double_eos=False):
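+    # ordered=True concatenates all lines into a single contiguous token
+    # stream; otherwise a list of per-line arrays is returned.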
+    if verbose: print('encoding file {} ...'.format(path))
+    assert exists(path)
+    encoded = []
+    with open(path, 'r') as f:
+      for idx, line in enumerate(f):
+        if verbose and idx > 0 and idx % 500000 == 0:
+          print('  line {}'.format(idx))
+        symbols = self.tokenize(line, add_eos=add_eos,
+          add_double_eos=add_double_eos)
+        encoded.append(self.convert_to_nparray(symbols))
+
+    if ordered:
+      encoded = np.concatenate(encoded)
+
+    return encoded
+
+  def encode_sents(self, sents, ordered=False, verbose=False):
+    if verbose: print('encoding {} sents ...'.format(len(sents)))
+    encoded = []
+    for idx, symbols in enumerate(sents):
+      if verbose and idx > 0 and idx % 500000 == 0:
+        print('  line {}'.format(idx))
+      encoded.append(self.convert_to_nparray(symbols))
+
+    if ordered:
+      encoded = np.concatenate(encoded)
+
+    return encoded
+
+  def add_special(self, sym):
+    if sym not in self.sym2idx:
+      self.idx2sym.append(sym)
+      self.sym2idx[sym] = len(self.idx2sym) - 1
+      setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+
+  def add_symbol(self, sym):
+    if sym not in self.sym2idx:
+      self.idx2sym.append(sym)
+      self.sym2idx[sym] = len(self.idx2sym) - 1
+
+  def get_sym(self, idx):
+    assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
+    return self.idx2sym[idx]
+
+  def get_idx(self, sym):
+    if sym in self.sym2idx:
+      return self.sym2idx[sym]
+    else:
+      assert hasattr(self, 'unk_idx')
+      return self.sym2idx.get(sym, self.unk_idx)
+
+  def get_symbols(self, indices):
+    return [self.get_sym(idx) for idx in indices]
+
+  def get_indices(self, symbols):
+    return [self.get_idx(sym) for sym in symbols]
+
+  def convert_to_nparray(self, symbols):
+    nparray = np.array(self.get_indices(symbols), dtype=np.int64)
+    return nparray
+
+  def convert_to_sent(self, indices, exclude=None):
+    if exclude is None:
+      return ' '.join([self.get_sym(idx) for idx in indices])
+    else:
+      return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+  def __len__(self):
+    return len(self.idx2sym)