Browse source

[Transformer-XL/PyT] Added Ampere support

Przemek Strzelczyk, 5 years ago
Parent commit: f838cf3292
29 changed files with 1010 additions and 521 deletions
  1. +7 -0      PyTorch/LanguageModeling/Transformer-XL/.dockerignore
  2. +3 -5      PyTorch/LanguageModeling/Transformer-XL/Dockerfile
  3. +386 -250  PyTorch/LanguageModeling/Transformer-XL/README.md
  4. +0 -5      PyTorch/LanguageModeling/Transformer-XL/pytorch/.dockerignore
  5. +9 -2      PyTorch/LanguageModeling/Transformer-XL/pytorch/data_utils.py
  6. +45 -7     PyTorch/LanguageModeling/Transformer-XL/pytorch/eval.py
  7. +20 -4     PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/mem_transformer_jit.py
  8. +61 -11    PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/proj_adaptive_softmax_jit.py
  9. +2 -1      PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py
  10. +25 -27   PyTorch/LanguageModeling/Transformer-XL/pytorch/mem_transformer.py
  11. +4 -5     PyTorch/LanguageModeling/Transformer-XL/pytorch/run.sub
  12. +2 -10    PyTorch/LanguageModeling/Transformer-XL/pytorch/run_multinode_wt103_large.sh
  13. +1 -1     PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/docker/interactive.sh
  14. +8 -0     PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/inference_benchmark.sh
  15. +5 -6     PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/infer_bench.sh
  16. +12 -6    PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/reference_inference_throughput
  17. +10 -10   PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/reference_training_throughput
  18. +7 -25    PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/train_bench.sh
  19. +5 -25    PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/train_full.sh
  20. +5 -24    PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/train_long.sh
  21. +5 -24    PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/train_short.sh
  22. +144 -66  PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py
  23. +1 -0     PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/__init__.py
  24. +47 -0    PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/exp_utils.py
  25. +49 -0    PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/gpu_affinity.py
  26. +15 -1    PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/proj_adaptive_softmax.py
  27. +64 -2    PyTorch/LanguageModeling/Transformer-XL/pytorch/wt103_base.yaml
  28. +67 -4    PyTorch/LanguageModeling/Transformer-XL/pytorch/wt103_large.yaml
  29. +1 -0     PyTorch/LanguageModeling/Transformer-XL/requirements.txt

+ 7 - 0
PyTorch/LanguageModeling/Transformer-XL/.dockerignore

@@ -0,0 +1,7 @@
+pytorch/LM-TFM*
+pytorch/internal/result*
+pytorch/*.out
+pytorch/*.log
+pytorch/*.json
+chkpt/*
+data/*

+ 3 - 5
PyTorch/LanguageModeling/Transformer-XL/pytorch/Dockerfile → PyTorch/LanguageModeling/Transformer-XL/Dockerfile

@@ -12,15 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.11-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
 FROM ${FROM_IMAGE_NAME}
 
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8
 
-WORKDIR /tmp/unique_for_apex
-RUN git clone https://github.com/NVIDIA/apex.git && cd apex && git reset --hard 3ae89c754d945e407a6674aa2006d5a0e35d540e
-RUN cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+RUN pip install --global-option="--cpp_ext" --global-option="--cuda_ext" git+git://github.com/NVIDIA/apex.git#egg=apex
 
 WORKDIR /workspace/transformer-xl/pytorch
 
@@ -28,4 +26,4 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger
 
-ADD . /workspace/transformer-xl/pytorch
+ADD pytorch/ /workspace/transformer-xl/pytorch/

File diff suppressed because it is too large
+ 386 - 250
PyTorch/LanguageModeling/Transformer-XL/README.md


+ 0 - 5
PyTorch/LanguageModeling/Transformer-XL/pytorch/.dockerignore

@@ -1,5 +0,0 @@
-LM-TFM*
-internal/result*
-*.out
-*.log
-*.json

+ 9 - 2
PyTorch/LanguageModeling/Transformer-XL/pytorch/data_utils.py

@@ -63,10 +63,14 @@ class LMOrderedIterator(object):
         # Number of mini-batches
         self.n_batch = (self.data.size(0) + self.bptt - 1) // self.bptt
 
-    def roll(self):
+        self.last_iter = None
+
+    def roll(self, seed):
+        rng = torch.Generator()
+        rng.manual_seed(seed)
         for i in range(self.data.size(1)):
             row = self.data[:, i]
-            shift = torch.randint(0, self.data.size(0), (1,))
+            shift = torch.randint(0, self.data.size(0), (1,), generator=rng)
             row = torch.cat((row[shift:], row[:shift]))
             self.data[:, i] = row
 
@@ -90,7 +94,10 @@ class LMOrderedIterator(object):
         return data, target, seq_len, warm
 
     def get_fixlen_iter(self, start=0):
+        if start != 0:
+            start += self.bptt
         for i in range(start, self.data.size(0) - 1, self.bptt):
+            self.last_iter = i
             yield self.get_batch(i)
 
     def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
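
The seeded roll above makes the per-epoch shuffle reproducible: shift offsets now come from a dedicated torch.Generator instead of the global RNG, so a restarted run replays the same data order. A minimal sketch of the idea (the 4x3 tensor is a stand-in for LMOrderedIterator.data):

    import torch

    def roll(data, seed):
        # Per-stream circular shift, driven by a private, seeded generator.
        rng = torch.Generator()
        rng.manual_seed(seed)
        for i in range(data.size(1)):
            row = data[:, i]
            shift = torch.randint(0, data.size(0), (1,), generator=rng)
            data[:, i] = torch.cat((row[shift:], row[:shift]))
        return data

    data = torch.arange(12).view(4, 3)  # (tokens, streams)
    # Same seed => same shifts, independent of any global torch.rand() calls:
    assert roll(data.clone(), seed=1234).equal(roll(data.clone(), seed=1234))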

+ 45 - 7
PyTorch/LanguageModeling/Transformer-XL/pytorch/eval.py

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import argparse
+import json
 import logging
 import math
 import os
@@ -32,6 +33,7 @@ from data_utils import tokenize_raw
 from utils.exp_utils import AverageMeter
 from utils.exp_utils import benchmark
 from utils.exp_utils import create_exp_dir
+from utils.exp_utils import l2_promote
 from utils.exp_utils import log_env_info
 
 
@@ -46,7 +48,7 @@ def parse_args():
     cfg_parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False)
 
     cfg_parser.add_argument('--config', default='default')
-    cfg_parser.add_argument('--config_file', default='config.yaml')
+    cfg_parser.add_argument('--config_file', default=None)
 
     config_args, _ = cfg_parser.parse_known_args()
 
@@ -81,16 +83,25 @@ def parse_args():
                         help='length of the extended context')
     parser.add_argument('--mem_len', type=int, default=640,
                         help='length of the retained previous heads')
+    parser.add_argument('--seed', type=int, default=1111,
+                        help='Random seed')
     parser.add_argument('--clamp_len', type=int, default=-1,
                         help='max positional embedding index')
     parser.add_argument('--cuda', action='store_true',
                         help='Run evaluation on a GPU using CUDA')
     parser.add_argument('--model', type=str, default='',
                         help='path to the checkpoint')
+    parser.add_argument('--manual_config', type=json.loads, default=None,
+                        help='Manually specify config for the model')
+    parser.add_argument('--manual_vocab', type=str, default='word',
+                        choices=['word', 'bpe'],
+                        help='Manually specify type of vocabulary')
     parser.add_argument('--fp16', action='store_true',
                         help='Run training in fp16/mixed precision')
     parser.add_argument('--log_all_ranks', action='store_true',
                         help='Enable logging for all distributed ranks')
+    parser.add_argument('--dllog_file', type=str, default='eval_log.json',
+                        help='Name of the DLLogger output file')
     parser.add_argument('--same_length', action='store_true',
                         help='set same length attention with masking')
     parser.add_argument('--no_env', action='store_true',
@@ -208,7 +219,7 @@ def evaluate(eval_iter, model, meters, log_interval, max_size=None, repeat=1):
                         'eval_loss': log_loss,
                         'eval_perplexity': log_ppl,
                         }
-                    dllogger.log(step=eval_step, data=dllogger_data)
+                    dllogger.log(step=tuple([eval_step]), data=dllogger_data)
 
                     log_throughput = 0
                     log_latency = 0
@@ -240,6 +251,7 @@ def compile_model(model, device, args):
 
 def main():
     args = parse_args()
+    utils.gpu_affinity.set_affinity(args.local_rank)
 
     if args.type == 'pytorch':
         from mem_transformer import MemTransformerLM
@@ -247,6 +259,7 @@ def main():
         from inference.mem_transformer_jit import MemTransformerLM
 
     torch.cuda.set_device(args.local_rank)
+    l2_promote()
     device = torch.device('cuda' if args.cuda else 'cpu')
     utils.distributed.init_distributed(args.cuda)
 
@@ -260,7 +273,7 @@ def main():
     else:
         log_file = f'eval_log.log'
 
-    dllog_file = f'eval_log.json'
+    dllog_file = args.dllog_file
     log_file = os.path.join(args.work_dir, log_file)
     dllog_file = os.path.join(args.work_dir, dllog_file)
     if args.debug:
@@ -279,6 +292,10 @@ def main():
     if not args.no_env:
         log_env_info()
 
+    # Set the random seed manually for reproducibility.
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
     if args.model:
         model_path = args.model
     elif args.work_dir:
@@ -286,7 +303,12 @@ def main():
     else:
         raise RuntimeError('Specify path to checkpoint using --model or --work_dir')
 
-    checkpoint = load_checkpoint(model_path)
+    if not args.manual_config:
+        checkpoint = load_checkpoint(model_path)
+        vocab_type = checkpoint['args'].vocab
+    else:
+        checkpoint = None
+        vocab_type = args.manual_vocab
 
     if args.manual:
         vocab = checkpoint['vocab']
@@ -304,7 +326,7 @@ def main():
                                             ext_len=args.ext_len, warmup=False)
     else:
         # Load dataset
-        corpus = get_lm_corpus(args.data, args.dataset, checkpoint['args'].vocab)
+        corpus = get_lm_corpus(args.data, args.dataset, vocab_type)
 
         if args.split == 'valid' or args.split == 'test':
             iter = corpus.get_iterator(args.split, args.batch_size, args.tgt_len,
@@ -322,7 +344,7 @@ def main():
 
     if args.load_torchscript:
         model = torch.jit.load(args.load_torchscript)
-    else:
+    elif not args.manual_config:
         checkpoint['model_config']['tgt_len'] = args.tgt_len
         checkpoint['model_config']['ext_len'] = args.ext_len
         checkpoint['model_config']['mem_len'] = args.mem_len
@@ -335,12 +357,21 @@ def main():
             model.load_state_dict(checkpoint['model_state'])
         elif args.type == 'torchscript':
             model.load_state_dict(checkpoint['model_state'], strict=False)
+    elif args.manual_config:
+        args.manual_config['tgt_len'] = args.tgt_len
+        args.manual_config['ext_len'] = args.ext_len
+        args.manual_config['mem_len'] = args.mem_len
+        args.manual_config['clamp_len'] = args.clamp_len
+        args.manual_config['same_length'] = args.same_length
+        args.manual_config['dtype'] = dtype
+
+        model = MemTransformerLM(**args.manual_config)
 
     model = model.eval()
     model = model.to(device)
     model = model.to(dtype)
 
-    if args.type == 'torchscript':
+    if args.type == 'torchscript' and not args.manual_config:
         state = checkpoint['model_state']
 
         tie_projs = checkpoint['model_config']['tie_projs']
@@ -444,4 +475,11 @@ def main():
 
 
 if __name__ == "__main__":
+    # Disable profiling executor
+    try:
+        torch._C._jit_set_profiling_executor(False)
+        torch._C._jit_set_profiling_mode(False)
+    except AttributeError:
+        pass
+
     main()
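
The new --manual_config flag lets eval.py instantiate a model without loading a checkpoint: type=json.loads makes argparse deserialize the flag's value into a dict, which is later expanded into MemTransformerLM(**config). A self-contained sketch of that parsing step (field names and values are illustrative):

    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('--manual_config', type=json.loads, default=None)

    args = parser.parse_args(
        ['--manual_config', '{"n_token": 267735, "n_layer": 16, "n_head": 8}']
    )
    assert args.manual_config['n_layer'] == 16  # parsed into a plain dict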

+ 20 - 4
PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/mem_transformer_jit.py

@@ -21,7 +21,6 @@ import torch.nn.functional as F
 
 from inference.proj_adaptive_softmax_jit import ProjectedAdaptiveLogSoftmax
 from utils.log_uniform_sampler import LogUniformSampler
-from utils.log_uniform_sampler import sample_logits
 
 
 class PositionalEmbedding(nn.Module):
@@ -212,7 +211,7 @@ class RelMultiHeadAttn(nn.Module):
 
         x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2))
 
-        x = x_padded[:, :, 1:].view_as(x)
+        x = x_padded.narrow(2, 1, x_padded.size(2) - 1).view_as(x)
 
         if zero_triu:
             ones = torch.ones((x.size(2), x.size(3)))
@@ -485,13 +484,29 @@ class AdaptiveEmbedding(nn.Module):
             self.emb_layers.append(
                 nn.Embedding(n_token, d_embed, sparse=(sample_softmax > 0))
             )
-            self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed)))
+            self.emb_projs.append(
+                nn.Parameter(
+                    torch.zeros(
+                        (d_proj, d_embed),
+                        dtype=dtype,
+                        device=torch.device('cuda'),
+                        )
+                    )
+                )
         else:
             for i in range(len(self.cutoffs)):
                 l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
                 d_emb_i = d_embed // (div_val ** i)
                 self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i))
-                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i)))
+                self.emb_projs.append(
+                    nn.Parameter(
+                        torch.zeros(
+                            (d_proj, d_emb_i),
+                            dtype=dtype,
+                            device=torch.device('cuda'),
+                            )
+                        )
+                    )
 
     def forward(self, inp):
         if self.div_val == 1:
@@ -595,6 +610,7 @@ class MemTransformerLM(nn.Module):
 
             self.crit = ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_model,
                                                     cutoffs, div_val=div_val,
+                                                    dtype=dtype,
                                                     tie_projs=tie_projs,
                                                     out_projs=emb_projs,
                                                     out_layers_weights=emb_layers)

+ 61 - 11
PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/proj_adaptive_softmax_jit.py

@@ -24,8 +24,8 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
     out_projs: List[Optional[torch.Tensor]]
 
     def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
-                 tie_projs=None, out_layers_weights=None, out_projs=None,
-                 keep_order=False):
+                 dtype=None, tie_projs=None, out_layers_weights=None,
+                 out_projs=None, keep_order=False):
         super().__init__()
 
         self.n_token = n_token
@@ -43,8 +43,20 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
         self.tie_projs = tie_projs
 
         if self.n_clusters > 0:
-            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
-            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
+            self.cluster_weight = nn.Parameter(
+                torch.zeros(
+                    self.n_clusters, self.d_embed,
+                    dtype=dtype,
+                    device=torch.device('cuda'),
+                    )
+                )
+            self.cluster_bias = nn.Parameter(
+                torch.zeros(
+                    self.n_clusters,
+                    dtype=dtype,
+                    device=torch.device('cuda'),
+                    )
+                )
 
         if not out_layers_weights:
             self.out_layers_weights = []
@@ -61,7 +73,13 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                     if tie_proj:
                         self.out_projs.append(out_projs[0])
                     else:
-                        self.out_projs.append(torch.zeros(d_proj, d_embed))
+                        self.out_projs.append(
+                            torch.zeros(
+                                d_proj, d_embed,
+                                dtype=dtype,
+                                device=torch.device('cuda'),
+                                )
+                            )
             else:
                 for i, tie_proj in enumerate(tie_projs):
                     self.out_projs.append(None)
@@ -71,15 +89,31 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                 if tie_proj:
                     self.out_projs.append(out_projs[i])
                 else:
-                    self.out_projs.append(torch.zeros(d_proj, d_emb_i))
+                    self.out_projs.append(
+                        torch.zeros(
+                            d_proj, d_emb_i,
+                            dtype=dtype,
+                            device=torch.device('cuda'),
+                            )
+                        )
 
         if div_val == 1:
             self.out_layers_biases.append(
-                (torch.zeros(n_token))
+                torch.zeros(
+                    n_token,
+                    dtype=dtype,
+                    device=torch.device('cuda'),
+                    )
                 )
             if not out_layers_weights:
                 self.out_layers_weights.append(
-                    nn.Parameter(torch.zeros(n_token, d_embed))
+                    nn.Parameter(
+                        torch.zeros(
+                            n_token, d_embed,
+                            dtype=dtype,
+                            device=torch.device('cuda'),
+                            )
+                        )
                     )
         else:
             for i in range(len(self.cutoffs)):
@@ -87,11 +121,23 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                 d_emb_i = d_embed // (div_val ** i)
 
                 self.out_layers_biases.append(
-                    nn.Parameter(torch.zeros(r_idx - l_idx))
+                    nn.Parameter(
+                        torch.zeros(
+                            r_idx - l_idx,
+                            dtype=dtype,
+                            device=torch.device('cuda'),
+                            )
+                        )
                     )
                 if not out_layers_weights:
                     self.out_layers_weights.append(
-                        nn.Parameter(torch.zeros(r_idx - l_idx, d_emb_i))
+                        nn.Parameter(
+                            torch.zeros(
+                                r_idx - l_idx, d_emb_i,
+                                dtype=dtype,
+                                device=torch.device('cuda'),
+                                )
+                            )
                         )
 
         self.keep_order = keep_order
@@ -146,7 +192,11 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
             head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
             head_logprob = F.log_softmax(head_logit, dim=1)
 
-            nll = torch.zeros_like(target, layout=torch.strided, dtype=hidden.dtype, device=hidden.device)
+            nll = torch.zeros_like(target,
+                                   layout=torch.strided,
+                                   dtype=hidden.dtype,
+                                   device=hidden.device,
+                                   )
 
             offset = 0
             cutoff_values = [0] + self.cutoffs
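
All of these allocation changes follow one pattern: parameters and buffers for the TorchScript inference path are created zero-initialized, directly in the target dtype and on the GPU, instead of as uninitialized torch.Tensor(...) needing a later cast. A sketch of the pattern, assuming a CUDA device is present (shapes illustrative):

    import torch
    import torch.nn as nn

    d_proj, d_embed = 1024, 256
    dtype = torch.float16

    # Allocated once, already in fp16 on the GPU; no post-hoc .half()/.to().
    proj = nn.Parameter(
        torch.zeros((d_proj, d_embed), dtype=dtype, device=torch.device('cuda'))
    )
    assert proj.dtype == torch.float16 and proj.is_cuda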

+ 2 - 1
PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py

@@ -155,12 +155,13 @@ def lamb_kernel(param, grad, exp_avg, exp_avg_sq, beta1: float,
     adam_step = exp_avg / (exp_avg_sq.sqrt() + eps)
     adam_step = adam_step + weight_decay * param
 
-    weight_norm = param.norm(p=2).clamp_(0, 10)
+    weight_norm = param.norm(p=2).clamp(0, 10)
     adam_norm = adam_step.norm(p=2)
 
     trust_ratio = weight_norm / (adam_norm + eps)
     trust_ratio = (weight_norm == 0.0) * 1.0 + (weight_norm != 0.0) * trust_ratio
     trust_ratio = (adam_norm == 0.0) * 1.0 + (adam_norm != 0.0) * trust_ratio
+    trust_ratio = trust_ratio.float()
 
     param = param - step_size * trust_ratio * adam_step
     return param, exp_avg, exp_avg_sq
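
For context, the two edited lines sit in the scripted LAMB kernel: clamp replaces the in-place clamp_ (presumably friendlier to the JIT fuser and autograd), and the trailing .float() keeps the trust ratio in fp32 under mixed precision. A standalone sketch of the branch-free trust-ratio logic:

    import torch

    def trust_ratio(weight_norm, adam_norm, eps=1e-6):
        # (x == 0) * 1.0 + (x != 0) * r is a branch-free if/else, which keeps
        # the computation a single fusable expression under torch.jit.script.
        r = weight_norm / (adam_norm + eps)
        r = (weight_norm == 0.0) * 1.0 + (weight_norm != 0.0) * r
        r = (adam_norm == 0.0) * 1.0 + (adam_norm != 0.0) * r
        return r.float()

    print(trust_ratio(torch.tensor(2.0), torch.tensor(4.0)))  # ~0.5
    print(trust_ratio(torch.tensor(0.0), torch.tensor(4.0)))  # 1.0, degenerate case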

+ 25 - 27
PyTorch/LanguageModeling/Transformer-XL/pytorch/mem_transformer.py

@@ -209,7 +209,7 @@ class RelMultiHeadAttn(nn.Module):
 
         x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2))
 
-        x = x_padded[:, :, 1:].view_as(x)
+        x = x_padded.narrow(2, 1, x_padded.size(2) - 1).view_as(x)
 
         if zero_triu:
             ones = torch.ones((x.size(2), x.size(3)))
@@ -470,13 +470,13 @@ class AdaptiveEmbedding(nn.Module):
                 nn.Embedding(n_token, d_embed, sparse=(sample_softmax > 0))
             )
             if d_proj != d_embed:
-                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed)))
+                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed).zero_()))
         else:
             for i in range(len(self.cutoffs)):
                 l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
                 d_emb_i = d_embed // (div_val ** i)
                 self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i))
-                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i)))
+                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i).zero_()))
 
     def forward(self, inp):
         if self.div_val == 1:
@@ -608,23 +608,23 @@ class MemTransformerLM(nn.Module):
         # default attention
         if self.attn_type == 0:
             self.pos_emb = PositionalEmbedding(self.d_model)
-            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
-            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head).zero_())
+            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head).zero_())
         # learnable
         elif self.attn_type == 1:
             self.r_emb = nn.Parameter(torch.Tensor(
-                    self.n_layer, self.max_klen, self.n_head, self.d_head))
+                    self.n_layer, self.max_klen, self.n_head, self.d_head).zero_())
             self.r_w_bias = nn.Parameter(torch.Tensor(
-                    self.n_layer, self.n_head, self.d_head))
+                    self.n_layer, self.n_head, self.d_head).zero_())
             self.r_bias = nn.Parameter(torch.Tensor(
-                    self.n_layer, self.max_klen, self.n_head))
+                    self.n_layer, self.max_klen, self.n_head).zero_())
         # absolute standard
         elif self.attn_type == 2:
             self.pos_emb = PositionalEmbedding(self.d_model)
         # absolute deeper SA
         elif self.attn_type == 3:
             self.r_emb = nn.Parameter(torch.Tensor(
-                    self.n_layer, self.max_klen, self.d_model))
+                    self.n_layer, self.max_klen, self.d_model).zero_())
 
     def reset_length(self, tgt_len, ext_len, mem_len):
         self.tgt_len = tgt_len
@@ -633,12 +633,9 @@ class MemTransformerLM(nn.Module):
 
     def init_mems(self):
         if self.mem_len > 0:
-            mems = []
             param = next(self.parameters())
-            for i in range(self.n_layer+1):
-                empty = torch.empty(0, dtype=param.dtype, device=param.device)
-                mems.append(empty)
-
+            mems = torch.empty(self.n_layer + 1, 0, dtype=param.dtype,
+                               device=param.device)
             return mems
         else:
             return None
@@ -657,13 +654,14 @@ class MemTransformerLM(nn.Module):
         # the tokens from `mlen + qlen - self.ext_len - self.mem_len`
         # to `mlen + qlen - self.ext_len`.
         with torch.no_grad():
-            new_mems = []
             end_idx = mlen + max(0, qlen - 0 - self.ext_len)
             beg_idx = max(0, end_idx - self.mem_len)
-            for i in range(len(hids)):
-
-                cat = torch.cat([mems[i], hids[i]], dim=0)
-                new_mems.append(cat[beg_idx:end_idx].detach())
+            stacked = torch.stack(hids)
+            if mems.numel():
+                cat = torch.cat([mems, stacked], dim=1)
+            else:
+                cat = stacked
+            new_mems = cat[:, beg_idx:end_idx].detach()
 
         return new_mems
 
@@ -699,17 +697,17 @@ class MemTransformerLM(nn.Module):
             core_out = self.drop(word_emb)
             pos_emb = self.drop(pos_emb)
 
-            hids.append(core_out)
+            hids.append(core_out.detach())
             for i, layer in enumerate(self.layers):
                 mems_i = None if mems is None else mems[i]
                 core_out = layer(core_out, pos_emb, self.r_w_bias,
                                  self.r_r_bias, dec_attn_mask=dec_attn_mask,
                                  mems=mems_i)
-                hids.append(core_out)
+                hids.append(core_out.detach())
         # learnable
         elif self.attn_type == 1:
             core_out = self.drop(word_emb)
-            hids.append(core_out)
+            hids.append(core_out.detach())
             for i, layer in enumerate(self.layers):
                 if self.clamp_len > 0:
                     r_emb = self.r_emb[i][-self.clamp_len:]
@@ -720,7 +718,7 @@ class MemTransformerLM(nn.Module):
                 mems_i = None if mems is None else mems[i]
                 core_out = layer(core_out, r_emb, self.r_w_bias[i],
                                  r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
-                hids.append(core_out)
+                hids.append(core_out.detach())
         # absolute
         elif self.attn_type == 2:
             pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
@@ -731,18 +729,18 @@ class MemTransformerLM(nn.Module):
 
             core_out = self.drop(word_emb + pos_emb[-qlen:])
 
-            hids.append(core_out)
+            hids.append(core_out.detach())
             for i, layer in enumerate(self.layers):
                 mems_i = None if mems is None else mems[i]
                 if mems_i is not None and len(mems_i) and i == 0:
                     mems_i += pos_emb[:mlen]
                 core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
                                  mems=mems_i)
-                hids.append(core_out)
+                hids.append(core_out.detach())
         elif self.attn_type == 3:
             core_out = self.drop(word_emb)
 
-            hids.append(core_out)
+            hids.append(core_out.detach())
             for i, layer in enumerate(self.layers):
                 mems_i = None if mems is None else mems[i]
                 if mems_i is not None and len(mems_i) and mlen > 0:
@@ -758,7 +756,7 @@ class MemTransformerLM(nn.Module):
 
                 core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
                                  mems=mems_i)
-                hids.append(core_out)
+                hids.append(core_out.detach())
 
         core_out = self.drop(core_out)
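
The memory refactor above replaces the per-layer Python list of mems with one stacked tensor (layers on dim 0, time on dim 1), so the update becomes a single cat/slice instead of a loop, and hidden states are detached as they are collected. A shape-level sketch, assuming ext_len == 0 so the slice keeps the last mem_len steps (sizes illustrative):

    import torch

    n_layer, mem_len, tgt_len, batch, d = 2, 4, 3, 1, 5
    mems = torch.zeros(n_layer + 1, mem_len, batch, d)  # dim 1 is time
    hids = [torch.randn(tgt_len, batch, d) for _ in range(n_layer + 1)]

    with torch.no_grad():
        cat = torch.cat([mems, torch.stack(hids)], dim=1)  # time dim grows
        new_mems = cat[:, -mem_len:].detach()              # keep last mem_len steps

    assert new_mems.shape == (n_layer + 1, mem_len, batch, d)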
 

+ 4 - 5
PyTorch/LanguageModeling/Transformer-XL/pytorch/run.sub

@@ -2,6 +2,7 @@
 #SBATCH -N 8                       # number of nodes
 #SBATCH -t 4:00:00                 # wall time
 #SBATCH -J "transformer-xl_pyt"    # job name
+#SBATCH --ntasks-per-node=16       # tasks per node
 #SBATCH --exclusive                # exclusive node access
 #SBATCH --mem=0                    # all mem avail
 #SBATCH --mail-type=FAIL           # only send email on failure
@@ -21,11 +22,6 @@ MOUNTS="${WORK_DIR}/data:${CONT_WORK_DIR}/data,${WORK_DIR}/results:${CONT_WORK_D
 # Create directory for the results
 srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${WORK_DIR}/results"
 
-# Configure DGX-2H parameters
-export DGXNSOCKET=2
-export DGXSOCKETCORES=24
-export DGXNGPU=16
-
 # Overwrite default parameters in the script
 EXTRA_TRAIN_PARAMS=""
 EXTRA_EVAL_PARAMS=""
@@ -48,6 +44,9 @@ fi
 if [ -n "$EVAL_INTERVAL" ]; then
    EXTRA_TRAIN_PARAMS+="--eval_interval ${EVAL_INTERVAL} "
 fi
+if [ -n "$LOG_INTERVAL" ]; then
+   EXTRA_TRAIN_PARAMS+="--log_interval ${LOG_INTERVAL} "
+fi
 if [ -n "$EVAL_BATCH_SIZE" ]; then
    EXTRA_TRAIN_PARAMS+="--eval_batch_size ${EVAL_BATCH_SIZE} "
    EXTRA_EVAL_PARAMS+="--batch_size ${EVAL_BATCH_SIZE} "

+ 2 - 10
PyTorch/LanguageModeling/Transformer-XL/pytorch/run_multinode_wt103_large.sh

@@ -14,17 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-DISTRIBUTED="-m torch.distributed.launch --nnodes ${WORLD_SIZE} --node_rank ${SLURM_NODEID}  \
-    --master_addr ${MASTER_ADDR} --master_port ${MASTER_PORT} --nproc_per_node=${DGXNGPU}"
-
-echo "MASTER_ADDR ${MASTER_ADDR}"
-echo "MASTER_PORT ${MASTER_PORT}"
-echo "WORLD_SIZE ${WORLD_SIZE}"
-echo "SLURM_NODEID ${SLURM_NODEID}"
-
 if [[ $1 == 'train' ]] || [[ $1 == 'all' ]]; then
     echo 'Run training...'
-    python ${DISTRIBUTED} train.py \
+    python train.py \
         --config_file wt103_large.yaml \
         --config 8dgx2_16gpu_fp16 \
         ${@:2}
@@ -32,7 +24,7 @@ fi
 
 if [[ $1 == 'eval' ]] || [[ $1 == 'all' ]]; then
     echo 'Run evaluation...'
-    python ${DISTRIBUTED} eval.py \
+    python eval.py \
         --config_file wt103_large.yaml \
         --config 8dgx2_16gpu_fp16 \
         ${@:2}

+ 1 - 1
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/docker/interactive.sh

@@ -14,4 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-docker run --gpus all --init -it --rm --network=host --ipc=host -v $(dirname $PWD):/workspace/transformer-xl transformer-xl bash
+docker run --gpus all --init -it --rm --network=host --ipc=host -v $PWD:/workspace/transformer-xl transformer-xl bash

+ 8 - 0
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/inference_benchmark.sh

@@ -16,11 +16,13 @@
 
 CHECKPOINT=${CHECKPOINT:-"LM-TFM/checkpoint_best.pt"}
 MODEL=${MODEL:-"base"}
+GPU=${GPU:-"v100"}
 
 BATCH_SIZES=(1 2 4 8 16 32)
 TYPES=("pytorch" "torchscript")
 # "empty" MATH corresponds to fp32
 MATHS=("" "--fp16")
+MATHS_FULL=("fp32" "fp16")
 
 
 for (( i = 0; i < ${#TYPES[@]}; i++ )); do
@@ -28,10 +30,16 @@ for (( i = 0; i < ${#TYPES[@]}; i++ )); do
       for (( k = 0; k < ${#MATHS[@]}; k++ )); do
          echo type: ${TYPES[i]} batch size: ${BATCH_SIZES[j]} math: ${MATHS[k]}
 
+         DIR="LM-TFM/inference/${GPU}_${BATCH_SIZES[j]}_${MATHS_FULL[k]}_${TYPES[i]}"
+         mkdir -p "${DIR}"
+
          taskset -c 0 bash run_wt103_"${MODEL}".sh eval 1 \
+            --work_dir "${DIR}" \
             --model "${CHECKPOINT}" \
             --type "${TYPES[i]}" \
             --batch_size "${BATCH_SIZES[j]}" \
+            --log_interval 1 \
+            --no_env \
             "${MATHS[k]}" \
             --save_data \
             "${@:1}"

+ 5 - 6
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/infer_bench.sh

@@ -44,11 +44,10 @@ GPU_NAME=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |uniq)
 echo 'GPU_NAME:' "${GPU_NAME}"
 GPU_COUNT=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |wc -l)
 echo 'GPU_COUNT:' "${GPU_COUNT}"
-GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader |head -n 1 |cut -f 1 -d " ")
-echo 'GPU_MEM:' "${GPU_MEM}"
 
-REFERENCE_PERF=$(grep "${MATH},${BATCH_SIZE},${GPU_NAME}" \
-   ${REFERENCE_FILE} | \cut -f 4 -d ',')
+REFERENCE_PERF=$(grep "${MATH},${BATCH_SIZE},${TYPE},${GPU_NAME}" \
+   ${REFERENCE_FILE} | \cut -f 5 -d ',')
+
 
 if [ -z "${REFERENCE_PERF}" ]; then
    echo "WARNING: COULD NOT FIND REFERENCE PERFORMANCE FOR EXECUTED CONFIG"
@@ -67,5 +66,5 @@ bash run_wt103_base.sh eval 1 \
    --target_perplexity 23.4 \
    --batch_size "${BATCH_SIZE}" \
    --type "${TYPE}" \
-   "${MATH_OPT}" \
-   "${TARGET_PERF}"
+   ${MATH_OPT} \
+   ${TARGET_PERF}

+ 12 - 6
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/reference_inference_throughput

@@ -1,6 +1,12 @@
-fp16,16,Tesla V100-SXM2-16GB,40000
-fp32,16,Tesla V100-SXM2-16GB,18750
-fp16,16,Tesla V100-SXM2-32GB,40000
-fp32,16,Tesla V100-SXM2-32GB,18750
-fp16,16,Tesla V100-SXM3-32GB,40000
-fp32,16,Tesla V100-SXM3-32GB,18750
+fp16,16,pytorch,Tesla V100-SXM2-16GB,42000
+fp32,16,pytorch,Tesla V100-SXM2-16GB,20000
+fp16,16,pytorch,Tesla V100-SXM2-32GB,42000
+fp32,16,pytorch,Tesla V100-SXM2-32GB,20000
+fp16,16,pytorch,Tesla V100-SXM3-32GB,42000
+fp32,16,pytorch,Tesla V100-SXM3-32GB,20000
+fp16,16,torchscript,Tesla V100-SXM2-16GB,54000
+fp32,16,torchscript,Tesla V100-SXM2-16GB,20000
+fp16,16,torchscript,Tesla V100-SXM2-32GB,54000
+fp32,16,torchscript,Tesla V100-SXM2-32GB,20000
+fp16,16,torchscript,Tesla V100-SXM3-32GB,54000
+fp32,16,torchscript,Tesla V100-SXM3-32GB,20000

+ 10 - 10
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/reference_training_throughput

@@ -1,10 +1,10 @@
-fp16,4,Tesla V100-SXM2-16GB,126000
-fp32,4,Tesla V100-SXM2-16GB,45000
-fp16,4,Tesla V100-SXM2-32GB,126000
-fp32,4,Tesla V100-SXM2-32GB,45000
-fp16,8,Tesla V100-SXM2-16GB,233000
-fp32,8,Tesla V100-SXM2-16GB,88000
-fp16,8,Tesla V100-SXM2-32GB,233000
-fp32,8,Tesla V100-SXM2-32GB,88000
-fp16,16,Tesla V100-SXM3-32GB,356000
-fp32,16,Tesla V100-SXM3-32GB,176000
+fp16,4,Tesla V100-SXM2-16GB,141000
+fp32,4,Tesla V100-SXM2-16GB,51500
+fp16,4,Tesla V100-SXM2-32GB,141000
+fp32,4,Tesla V100-SXM2-32GB,51500
+fp16,8,Tesla V100-SXM2-16GB,257000
+fp32,8,Tesla V100-SXM2-16GB,97000
+fp16,8,Tesla V100-SXM2-32GB,257000
+fp32,8,Tesla V100-SXM2-32GB,97000
+fp16,16,Tesla V100-SXM3-32GB,388000
+fp32,16,Tesla V100-SXM3-32GB,205000

+ 7 - 25
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/train_bench.sh

@@ -25,35 +25,19 @@ if [[ ${MATH} != "fp16" && ${MATH} != "fp32" ]]; then
    exit 1
 fi
 
-if [[ ${MATH} == 'fp16' ]]; then
-   MATH_OPT='--fp16'
-elif [[ ${MATH} == 'fp32' ]]; then
-   MATH_OPT=''
-fi
-
 PERF_TOLERANCE=0.9
-GLOBAL_BATCH_SIZE=256
 
 GPU_NAME=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |uniq)
 echo 'GPU_NAME:' "${GPU_NAME}"
 GPU_COUNT=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |wc -l)
 echo 'GPU_COUNT:' "${GPU_COUNT}"
-GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader |head -n 1 |cut -f 1 -d " ")
-echo 'GPU_MEM:' "${GPU_MEM}"
 
-if (( GPU_MEM > 16500 )); then
-   LOCAL_BATCH_SIZE=32
+if (( GPU_COUNT == 16 )); then
+   SYSTEM=dgx2
 else
-   if [[ ${MATH} == 'fp16' ]]; then
-      LOCAL_BATCH_SIZE=32
-   elif [[ ${MATH} == 'fp32' ]]; then
-      LOCAL_BATCH_SIZE=16
-   fi
+   SYSTEM=dgx1
 fi
 
-BATCH_CHUNK=$((GLOBAL_BATCH_SIZE / (GPU_COUNT * LOCAL_BATCH_SIZE)))
-BATCH_CHUNK=$((BATCH_CHUNK < 1 ? 1 : BATCH_CHUNK))
-
 REFERENCE_PERF=$(grep "${MATH},${GPU_COUNT},${GPU_NAME}" \
    ${REFERENCE_FILE} | \cut -f 4 -d ',')
 
@@ -68,11 +52,9 @@ fi
 cd $REPO_DIR
 
 bash run_wt103_base.sh train "${GPU_COUNT}" \
+   --config ${SYSTEM}_${GPU_COUNT}gpu_${MATH} \
+   --max_step $((512 / GPU_COUNT)) \
    --debug \
-   --max_step $((256 / GPU_COUNT)) \
-   --batch_chunk "${BATCH_CHUNK}" \
+   --no_eval \
    --log_interval 1 \
-   --adaptive \
-   --vocab word \
-   "${MATH_OPT}" \
-   "${TARGET_PERF}"
+   ${TARGET_PERF}

+ 5 - 25
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/train_full.sh

@@ -25,35 +25,19 @@ if [[ ${MATH} != "fp16" && ${MATH} != "fp32" ]]; then
    exit 1
 fi
 
-if [[ ${MATH} == 'fp16' ]]; then
-   MATH_OPT='--fp16'
-elif [[ ${MATH} == 'fp32' ]]; then
-   MATH_OPT=''
-fi
-
 PERF_TOLERANCE=0.9
-GLOBAL_BATCH_SIZE=256
 
 GPU_NAME=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |uniq)
 echo 'GPU_NAME:' "${GPU_NAME}"
 GPU_COUNT=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |wc -l)
 echo 'GPU_COUNT:' "${GPU_COUNT}"
-GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader |head -n 1 |cut -f 1 -d " ")
-echo 'GPU_MEM:' "${GPU_MEM}"
 
-if (( GPU_MEM > 16500 )); then
-   LOCAL_BATCH_SIZE=32
+if (( GPU_COUNT == 16 )); then
+   SYSTEM=dgx2
 else
-   if [[ ${MATH} == 'fp16' ]]; then
-      LOCAL_BATCH_SIZE=32
-   elif [[ ${MATH} == 'fp32' ]]; then
-      LOCAL_BATCH_SIZE=16
-   fi
+   SYSTEM=dgx1
 fi
 
-BATCH_CHUNK=$((GLOBAL_BATCH_SIZE / (GPU_COUNT * LOCAL_BATCH_SIZE)))
-BATCH_CHUNK=$((BATCH_CHUNK < 1 ? 1 : BATCH_CHUNK))
-
 REFERENCE_PERF=$(grep "${MATH},${GPU_COUNT},${GPU_NAME}" \
    ${REFERENCE_FILE} | \cut -f 4 -d ',')
 
@@ -68,12 +52,8 @@ fi
 cd $REPO_DIR
 
 bash run_wt103_base.sh train "${GPU_COUNT}" \
+   --config ${SYSTEM}_${GPU_COUNT}gpu_${MATH} \
    --debug \
-   --max_step 40000 \
    --target_perplexity 23.4 \
-   --batch_chunk "${BATCH_CHUNK}" \
    --log_interval 1 \
-   --adaptive \
-   --vocab word \
-   "${MATH_OPT}" \
-   "${TARGET_PERF}"
+   ${TARGET_PERF}

+ 5 - 24
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/train_long.sh

@@ -25,35 +25,19 @@ if [[ ${MATH} != "fp16" && ${MATH} != "fp32" ]]; then
    exit 1
 fi
 
-if [[ ${MATH} == 'fp16' ]]; then
-   MATH_OPT='--fp16'
-elif [[ ${MATH} == 'fp32' ]]; then
-   MATH_OPT=''
-fi
-
 PERF_TOLERANCE=0.9
-GLOBAL_BATCH_SIZE=256
 
 GPU_NAME=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |uniq)
 echo 'GPU_NAME:' "${GPU_NAME}"
 GPU_COUNT=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |wc -l)
 echo 'GPU_COUNT:' "${GPU_COUNT}"
-GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader |head -n 1 |cut -f 1 -d " ")
-echo 'GPU_MEM:' "${GPU_MEM}"
 
-if (( GPU_MEM > 16500 )); then
-   LOCAL_BATCH_SIZE=32
+if (( GPU_COUNT == 16 )); then
+   SYSTEM=dgx2
 else
-   if [[ ${MATH} == 'fp16' ]]; then
-      LOCAL_BATCH_SIZE=32
-   elif [[ ${MATH} == 'fp32' ]]; then
-      LOCAL_BATCH_SIZE=16
-   fi
+   SYSTEM=dgx1
 fi
 
-BATCH_CHUNK=$((GLOBAL_BATCH_SIZE / (GPU_COUNT * LOCAL_BATCH_SIZE)))
-BATCH_CHUNK=$((BATCH_CHUNK < 1 ? 1 : BATCH_CHUNK))
-
 REFERENCE_PERF=$(grep "${MATH},${GPU_COUNT},${GPU_NAME}" \
    ${REFERENCE_FILE} | \cut -f 4 -d ',')
 
@@ -68,13 +52,10 @@ fi
 cd $REPO_DIR
 
 bash run_wt103_base.sh train "${GPU_COUNT}" \
+   --config ${SYSTEM}_${GPU_COUNT}gpu_${MATH} \
    --debug \
    --max_step 30000 \
    --max_step_scheduler 40000 \
    --target_perplexity 24.2 \
-   --batch_chunk "${BATCH_CHUNK}" \
    --log_interval 1 \
-   --adaptive \
-   --vocab word \
-   "${MATH_OPT}" \
-   "${TARGET_PERF}"
+   ${TARGET_PERF}

+ 5 - 24
PyTorch/LanguageModeling/Transformer-XL/pytorch/scripts/tests/train_short.sh

@@ -25,35 +25,19 @@ if [[ ${MATH} != "fp16" && ${MATH} != "fp32" ]]; then
    exit 1
 fi
 
-if [[ ${MATH} == 'fp16' ]]; then
-   MATH_OPT='--fp16'
-elif [[ ${MATH} == 'fp32' ]]; then
-   MATH_OPT=''
-fi
-
 PERF_TOLERANCE=0.9
-GLOBAL_BATCH_SIZE=256
 
 GPU_NAME=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |uniq)
 echo 'GPU_NAME:' "${GPU_NAME}"
 GPU_COUNT=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader |wc -l)
 echo 'GPU_COUNT:' "${GPU_COUNT}"
-GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader |head -n 1 |cut -f 1 -d " ")
-echo 'GPU_MEM:' "${GPU_MEM}"
 
-if (( GPU_MEM > 16500 )); then
-   LOCAL_BATCH_SIZE=32
+if (( GPU_COUNT == 16 )); then
+   SYSTEM=dgx2
 else
-   if [[ ${MATH} == 'fp16' ]]; then
-      LOCAL_BATCH_SIZE=32
-   elif [[ ${MATH} == 'fp32' ]]; then
-      LOCAL_BATCH_SIZE=16
-   fi
+   SYSTEM=dgx1
 fi
 
-BATCH_CHUNK=$((GLOBAL_BATCH_SIZE / (GPU_COUNT * LOCAL_BATCH_SIZE)))
-BATCH_CHUNK=$((BATCH_CHUNK < 1 ? 1 : BATCH_CHUNK))
-
 REFERENCE_PERF=$(grep "${MATH},${GPU_COUNT},${GPU_NAME}" \
    ${REFERENCE_FILE} | \cut -f 4 -d ',')
 
@@ -68,13 +52,10 @@ fi
 cd $REPO_DIR
 
 bash run_wt103_base.sh train "${GPU_COUNT}" \
+   --config ${SYSTEM}_${GPU_COUNT}gpu_${MATH} \
    --debug \
    --max_step 5000 \
    --max_step_scheduler 40000 \
    --target_perplexity 43.5 \
-   --batch_chunk "${BATCH_CHUNK}" \
    --log_interval 1 \
-   --adaptive \
-   --vocab word \
-   "${MATH_OPT}" \
-   "${TARGET_PERF}"
+   ${TARGET_PERF}

+ 144 - 66
PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py

@@ -20,6 +20,7 @@ import itertools
 import logging
 import math
 import os
+import shutil
 import sys
 import time
 
@@ -38,9 +39,12 @@ from data_utils import get_lm_corpus
 from mem_transformer import MemTransformerLM
 from utils.data_parallel import BalancedDataParallel
 from utils.exp_utils import AverageMeter
+from utils.exp_utils import TimeoutHandler
 from utils.exp_utils import benchmark
 from utils.exp_utils import create_exp_dir
+from utils.exp_utils import l2_promote
 from utils.exp_utils import log_env_info
+from utils.exp_utils import register_ignoring_timeout_handler
 
 
 def parse_args():
@@ -54,7 +58,7 @@ def parse_args():
     cfg_parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False)
 
     cfg_parser.add_argument('--config', default='default')
-    cfg_parser.add_argument('--config_file', default='config.yaml')
+    cfg_parser.add_argument('--config_file', default=None)
 
     config_args, _ = cfg_parser.parse_known_args()
 
@@ -81,16 +85,25 @@ def parse_args():
                          help='Run in debug mode (do not create exp dir)')
     general.add_argument('--log_all_ranks', action='store_true',
                          help='Enable logging from all distributed ranks')
-    general.add_argument('--save-all', action='store_true',
+    general.add_argument('--dllog_file', type=str, default='train_log.json',
+                         help='Name of the DLLogger output file')
+    general.add_argument('--txtlog_file', type=str, default='train_log.log',
+                         help='Name of the txt log file')
+    general.add_argument('--save_all', action='store_true',
                          help='Save all checkpoints')
     general.add_argument('--no_env', action='store_true',
                          help='Do not print info on execution env')
+    general.add_argument('--no_eval', action='store_true',
+                         help='Disable model evaluation')
     general.add_argument('--log_interval', type=int, default=10,
                          help='Report interval')
     general.add_argument('--target_throughput', type=float, default=None,
                          help='Target training throughput (for benchmarking)')
     general.add_argument('--target_perplexity', type=float, default=None,
                          help='Target validation perplexity (for benchmarking)')
+    general.add_argument('--amp_mode', type=str, default='O2',
+                         choices=['O0', 'O1', 'O2', 'O3'],
+                         help='Optimization level for apex amp')
 
     dataset = parser.add_argument_group('dataset setup')
     dataset.add_argument('--data', type=str, default='../data/wikitext-103',
@@ -238,7 +251,8 @@ def parse_args():
 
 
 def save_checkpoint(args, model, model_config, optimizer, scheduler, vocab,
-                    train_step, best_val_loss, work_dir, name='checkpoint.pt'):
+                    epoch, batch, last_iter, train_step, best_val_loss,
+                    is_best, work_dir):
     if args.fp16:
         amp_state = amp.state_dict()
     else:
@@ -252,15 +266,35 @@ def save_checkpoint(args, model, model_config, optimizer, scheduler, vocab,
         'scheduler_state': scheduler.state_dict(),
         'vocab': vocab,
         'amp_state': amp_state,
+        'epoch': epoch,
+        'batch': batch,
+        'last_iter': last_iter,
         'train_step': train_step,
         'best_val_loss': best_val_loss,
         }
 
+    last_chkpt_fname = 'checkpoint_last.pt'
+
     with utils.distributed.sync_workers() as rank:
-        path = os.path.join(work_dir, name)
-        logging.info(f'Saving checkpoint to {path}')
+        last_chkpt_path = os.path.join(work_dir, last_chkpt_fname)
         if rank == 0:
-            torch.save(state, path)
+            # always save last checkpoint
+            logging.info(f'Saving checkpoint to {last_chkpt_path}')
+            torch.save(state, last_chkpt_path)
+
+            # save best checkpoint if better than previous best
+            if is_best:
+                best_chkpt_fname = 'checkpoint_best.pt'
+                best_chkpt_path = os.path.join(work_dir, best_chkpt_fname)
+                logging.info(f'Saving checkpoint to {best_chkpt_path}')
+                shutil.copy(last_chkpt_path, best_chkpt_path)
+
+            # save every checkpoint if save_all is true
+            if args.save_all:
+                step_chkpt_fname = f'checkpoint_{train_step}.pt'
+                step_chkpt_path = os.path.join(work_dir, step_chkpt_fname)
+                logging.info(f'Saving checkpoint to {step_chkpt_path}')
+                shutil.copy(last_chkpt_path, step_chkpt_path)
 
 
 def load_checkpoint(path):
@@ -367,7 +401,7 @@ def evaluate(eval_iter, model, args):
             loss, mems = model(data, target, mems)
             loss = loss.float().mean()
             if warm:
-                assert (not mems) or all([m.size(0) == model.mem_len for m in mems])
+                assert (mems is None) or mems.size(1) == model.mem_len
                 total_loss += seq_len * loss.item()
                 total_len += seq_len
 
@@ -382,8 +416,9 @@ def evaluate(eval_iter, model, args):
 
 
 def train(tr_iter, va_iter, model, para_model, model_config, optimizer,
-          optimizer_sparse, scheduler, scheduler_sparse, vocab, epoch, train_step,
-          best_val_loss, meters, args):
+          optimizer_sparse, scheduler, scheduler_sparse, vocab, epoch,
+          last_batch, last_iter, train_step, best_val_loss, meters,
+          timeout_handler, args):
     # Turn on training mode which enables dropout.
     model.train()
 
@@ -393,13 +428,17 @@ def train(tr_iter, va_iter, model, para_model, model_config, optimizer,
     log_start_time = time.time()
 
     mems = [None for _ in range(args.batch_chunk)]
-    train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter
+    if args.varlen:
+        train_iter = tr_iter.get_varlen_iter(start=last_iter)
+    else:
+        train_iter = tr_iter.get_fixlen_iter(start=last_iter)
 
-    for batch, (data, target, seq_len, _) in enumerate(train_iter):
+    for batch, (data, target, seq_len, _) in enumerate(train_iter, start=last_batch+1):
         log_step += 1
         target_tokens += target.numel()
 
-        model.zero_grad()
+        for param in model.parameters():
+            param.grad = None
 
         data_chunks = torch.chunk(data, args.batch_chunk, 1)
         target_chunks = torch.chunk(target, args.batch_chunk, 1)
@@ -467,7 +506,7 @@ def train(tr_iter, va_iter, model, para_model, model_config, optimizer,
                 '| ms/batch {:5.1f} | tok/s {:7.0f} | loss {:5.2f}'.format(
                     epoch,
                     train_step,
-                    batch+1,
+                    batch,
                     tr_iter.n_batch,
                     lr,
                     avg_elapsed * 1000,
@@ -492,9 +531,13 @@ def train(tr_iter, va_iter, model, para_model, model_config, optimizer,
                 dllogger_data['train_perplexity'] = math.exp(cur_loss)
 
             logging.info(log_str)
-            dllogger.log(step=train_step, data=dllogger_data)
+            dllogger.log(step=tuple([train_step]), data=dllogger_data)
+
+        do_periodic_eval = train_step % args.eval_interval == 0
+        is_final_step = train_step == args.max_step
+        interrupted = timeout_handler.interrupted
 
-        if train_step % args.eval_interval == 0:
+        if (do_periodic_eval or is_final_step or interrupted) and not args.no_eval:
             eval_start_time = time.time()
             val_loss = evaluate(va_iter, model, args)
             val_loss = utils.distributed.all_reduce_item(val_loss, op='mean')
@@ -521,30 +564,21 @@ def train(tr_iter, va_iter, model, para_model, model_config, optimizer,
                 dllogger_data['valid_perplexity'] = math.exp(val_loss)
             logging.info(log_str)
             logging.info('-' * 100)
-            dllogger.log(step=train_step, data=dllogger_data)
+            dllogger.log(step=tuple([train_step]), data=dllogger_data)
+
+            last_iter = tr_iter.last_iter
 
-            # Save the model if the validation loss is the best we've seen so far.
+            # Check if the validation loss is the best we've seen so far.
+            is_best = False
             if not best_val_loss or val_loss < best_val_loss:
                 best_val_loss = val_loss
-                if not args.debug:
-                    name = 'checkpoint_best.pt'
-                    save_checkpoint(args, model, model_config, optimizer,
-                                    scheduler, vocab, train_step,
-                                    best_val_loss, args.work_dir, name)
-
-            # Always save after eval if save_all is true and not debug
-            if not args.debug and args.save_all:
-                name = f'checkpoint_{train_step}.pt'
-                save_checkpoint(args, model, model_config, optimizer,
-                                scheduler, vocab, train_step, best_val_loss,
-                                args.work_dir, name)
+                is_best = True
 
-            # Save last checkpoint if not debug and not save_all
-            if not args.debug and not args.save_all:
-                name = 'checkpoint_last.pt'
+            if not args.debug:
                 save_checkpoint(args, model, model_config, optimizer,
-                                scheduler, vocab, train_step, best_val_loss,
-                                args.work_dir, name)
+                                scheduler, vocab, epoch, batch, last_iter,
+                                train_step, best_val_loss, is_best,
+                                args.work_dir)
 
             # dev-performance based learning rate annealing
             if args.scheduler == 'dev_perf':
@@ -555,16 +589,22 @@ def train(tr_iter, va_iter, model, para_model, model_config, optimizer,
             # subtract eval time from timers for training
             log_start_time += time.time() - eval_start_time
 
-        if train_step == args.max_step:
+        if interrupted:
+            logging.info(f'Received SIGTERM, exiting')
+            sys.exit(0)
+
+        if is_final_step:
             break
     return train_step, best_val_loss
 
 
 def main():
     args = parse_args()
+    utils.gpu_affinity.set_affinity(args.local_rank)
 
     # Initialize device and distributed backend
     torch.cuda.set_device(args.local_rank)
+    l2_promote()
     device = torch.device('cuda' if args.cuda else 'cpu')
     utils.distributed.init_distributed(args.cuda)
 
@@ -584,8 +624,8 @@ def main():
     if args.log_all_ranks:
         log_file = f'train_log_rank_{utils.distributed.get_rank()}.log'
     else:
-        log_file = f'train_log.log'
-    dllog_file = f'train_log.json'
+        log_file = args.txtlog_file
+    dllog_file = args.dllog_file
     log_file = os.path.join(args.work_dir, log_file)
     dllog_file = os.path.join(args.work_dir, dllog_file)
 
@@ -607,9 +647,13 @@ def main():
     logging.info(args)
     dllogger.log(step='PARAMETER', data=vars(args))
 
+    logging.info(f'world size: {utils.distributed.get_world_size()}')
+
     if not args.no_env:
         log_env_info()
 
+    register_ignoring_timeout_handler()
+
     # Set the random seed manually for reproducibility.
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
@@ -732,7 +776,7 @@ def main():
         model, optimizer = amp.initialize(
             model,
             optimizer,
-            opt_level='O2',
+            opt_level=args.amp_mode,
             )
 
     if args.multi_gpu == 'ddp' and torch.distributed.is_initialized():
@@ -806,20 +850,36 @@ def main():
     logging.info('#non emb params = {}'.format(args.n_nonemb_param))
 
     train_step = 0
+    start_epoch = 1
+    last_batch = 0
+    last_iter = 0
     best_val_loss = None
 
     if args.restart:
-        checkpoint = load_checkpoint(args.restart)
-        model.load_state_dict(checkpoint['model_state'])
-        optimizer.load_state_dict(checkpoint['optimizer_state'])
-        scheduler.load_state_dict(checkpoint['scheduler_state'])
-        if args.fp16:
-            amp.load_state_dict(checkpoint['amp_state'])
-        train_step = checkpoint['train_step']
-        best_val_loss = checkpoint['best_val_loss']
-
-        model.apply(functools.partial(update_dropout, args=args))
-        model.apply(functools.partial(update_dropatt, args=args))
+        try:
+            checkpoint = load_checkpoint(args.restart)
+            model.load_state_dict(checkpoint['model_state'])
+            optimizer.load_state_dict(checkpoint['optimizer_state'])
+            scheduler.load_state_dict(checkpoint['scheduler_state'])
+            if args.fp16:
+                amp.load_state_dict(checkpoint['amp_state'])
+            train_step = checkpoint['train_step']
+            start_epoch = checkpoint['epoch']
+            last_batch = checkpoint['batch']
+            last_iter = checkpoint['last_iter']
+            best_val_loss = checkpoint['best_val_loss']
+
+            if train_step >= args.max_step:
+                logging.info(f'Loaded checkpoint after {train_step} steps, but '
+                            f'this run was scheduled for a total of '
+                            f'{args.max_step} steps, exiting')
+                sys.exit(1)
+
+            model.apply(functools.partial(update_dropout, args=args))
+            model.apply(functools.partial(update_dropatt, args=args))
+        except FileNotFoundError:
+            logging.info(f'Could not load checkpoint from {args.restart}, '
+                         f'starting training from random init')
 
     meters = {}
     warmup = args.mem_len // args.tgt_len + 2
@@ -830,23 +890,28 @@ def main():
     # Loop over epochs.
     # At any point you can hit Ctrl + C to break out of training early.
     start_time = time.time()
-    try:
-        for epoch in itertools.count(start=1):
-            if args.roll:
-                tr_iter.roll()
-            train_step, best_val_loss = train(
-                tr_iter, va_iter, model, para_model, model_config, optimizer,
-                optimizer_sparse, scheduler, scheduler_sparse, vocab, epoch,
-                train_step, best_val_loss, meters, args
-                )
+    with TimeoutHandler() as timeout_handler:
+        try:
+            for epoch in itertools.count(start=start_epoch):
+                if args.roll:
+                    tr_iter.roll(seed=args.seed + epoch)
+                train_step, best_val_loss = train(
+                    tr_iter, va_iter, model, para_model, model_config,
+                    optimizer, optimizer_sparse, scheduler, scheduler_sparse,
+                    vocab, epoch, last_batch, last_iter, train_step,
+                    best_val_loss, meters, timeout_handler, args
+                    )
 
-            if train_step == args.max_step:
-                logging.info('-' * 100)
-                logging.info('End of training')
-                break
-    except KeyboardInterrupt:
-        logging.info('-' * 100)
-        logging.info('Exiting from training early')
+                last_batch = 0
+                last_iter = 0
+
+                if train_step == args.max_step:
+                    logging.info('-' * 100)
+                    logging.info('End of training')
+                    break
+        except KeyboardInterrupt:
+            logging.info('-' * 100)
+            logging.info('Exiting from training early')
     elapsed = time.time() - start_time
 
     ###########################################################################
@@ -854,7 +919,7 @@ def main():
     ###########################################################################
     summary = {}
     test_path = os.path.join(args.work_dir, 'checkpoint_best.pt')
-    if not args.debug and os.path.exists(test_path):
+    if not args.debug and not args.no_eval and os.path.exists(test_path):
         # Load the best saved model.
         checkpoint = load_checkpoint(test_path)
         model.load_state_dict(checkpoint['model_state'])
@@ -911,4 +976,17 @@ def main():
 
 
 if __name__ == "__main__":
+    # Disable profiling executor
+    try:
+        torch._C._jit_set_profiling_executor(False)
+        torch._C._jit_set_profiling_mode(False)
+    except AttributeError:
+        pass
+
+    # Before we do anything with models, we want to ensure that we get fp16
+    # execution of torch.einsum.
+    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
+    # Note that running `--amp_mode O2` will remove the need for this
+    # code, but it is still valid.
+    amp.register_half_function(torch, 'einsum')
     main()
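
The einsum registration above is easy to get wrong: Apex requires half-function registration to happen before `amp.initialize`, which is why this commit places it at module import time. A minimal sanity-check sketch (not part of this commit; assumes a CUDA device and Apex installed):

import torch
from apex import amp

amp.register_half_function(torch, 'einsum')  # must precede amp.initialize

model = torch.nn.Linear(8, 8).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

a = torch.randn(4, 8, device='cuda')
out = torch.einsum('ik,kj->ij', a, model.weight)
print(out.dtype)  # expected torch.float16; without registration, torch.float32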

+ 1 - 0
PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/__init__.py

@@ -14,3 +14,4 @@
 
 from . import distributed
 from . import exp_utils
+from . import gpu_affinity

+ 47 - 0
PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/exp_utils.py

@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import ctypes
 import datetime
 import logging
 import os
 import shutil
+import signal
 import sys
 import time
 
@@ -54,6 +56,41 @@ class AverageMeter:
                 self.vals.append(val)
 
 
+class TimeoutHandler:
+    def __init__(self, sig=signal.SIGTERM):
+        self.sig = sig
+
+    def __enter__(self):
+        self.interrupted = False
+        self.released = False
+        self.original_handler = signal.getsignal(self.sig)
+
+        def handler(signum, frame):
+            self.release()
+            self.interrupted = True
+            logging.info(f'Received signal {signum}')
+
+        signal.signal(self.sig, handler)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.release()
+
+    def release(self):
+        if self.released:
+            return False
+
+        signal.signal(self.sig, self.original_handler)
+        self.released = True
+        return True
+
+
+def register_ignoring_timeout_handler(sig=signal.SIGTERM):
+    def handler(signum, frame):
+        logging.info(f'Received signal {signum}, ignoring')
+    signal.signal(sig, handler)
+
+
 def log_env_info():
     """
     Prints information about execution environment.
@@ -175,3 +212,13 @@ def build_work_dir_name(work_dir, dataset, append_dataset, append_time):
 
         work_dir = os.path.join(work_dir, now_str)
     return work_dir
+
+
+def l2_promote():
+    _libcudart = ctypes.CDLL('libcudart.so')
+    # Set device limit on the current device
+    # cudaLimitMaxL2FetchGranularity = 0x05
+    # cudaDeviceGetLimit writes a size_t, so allocate a size_t, not an int
+    pValue = ctypes.cast((ctypes.c_size_t * 1)(), ctypes.POINTER(ctypes.c_size_t))
+    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_size_t(128))
+    _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
+    assert pValue.contents.value == 128
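
The `TimeoutHandler` above is what the reworked training loop in train.py enters via `with TimeoutHandler() as timeout_handler:`; the loop polls `timeout_handler.interrupted` so a scheduler-sent SIGTERM triggers a checkpoint-and-exit instead of killing the job mid-step. A self-contained usage sketch (the training step is simulated; assumes it runs from the repo's pytorch/ directory so `utils` is importable):

import itertools
import os
import signal
import time

from utils.exp_utils import TimeoutHandler

with TimeoutHandler(sig=signal.SIGTERM) as timeout_handler:
    for step in itertools.count():
        time.sleep(0.1)  # stand-in for one training iteration
        if step == 5:
            os.kill(os.getpid(), signal.SIGTERM)  # simulate preemption
        if timeout_handler.interrupted:
            print(f'interrupted at step {step}: checkpoint and exit cleanly')
            break

Note that the handler calls `release()` on the first signal, restoring the original disposition, so a second SIGTERM terminates the process immediately.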

+ 49 - 0
PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/gpu_affinity.py

@@ -0,0 +1,49 @@
+import math
+import os
+
+import pynvml
+
+pynvml.nvmlInit()
+
+
+def systemGetDriverVersion():
+    return pynvml.nvmlSystemGetDriverVersion()
+
+
+def deviceGetCount():
+    return pynvml.nvmlDeviceGetCount()
+
+
+class device:
+    # assume nvml returns list of 64 bit ints
+    _nvml_affinity_elements = math.ceil(os.cpu_count() / 64)
+
+    def __init__(self, device_idx):
+        super().__init__()
+        self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
+
+    def getName(self):
+        return pynvml.nvmlDeviceGetName(self.handle)
+
+    def getCpuAffinity(self):
+        affinity_string = ''
+        for j in pynvml.nvmlDeviceGetCpuAffinity(
+            self.handle, device._nvml_affinity_elements
+        ):
+            # assume nvml returns list of 64 bit ints
+            affinity_string = '{:064b}'.format(j) + affinity_string
+        affinity_list = [int(x) for x in affinity_string]
+        affinity_list.reverse()  # so core 0 is in 0th element of list
+
+        return [i for i, e in enumerate(affinity_list) if e != 0]
+
+
+def set_affinity(gpu_id=None):
+    if gpu_id is None:
+        gpu_id = int(os.getenv('LOCAL_RANK', 0))
+
+    dev = device(gpu_id)
+    os.sched_setaffinity(0, dev.getCpuAffinity())
+
+    # return the set of logical-core ids this process is now bound to
+    return os.sched_getaffinity(0)
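
A short usage sketch for the affinity helper (assumes an NVIDIA driver plus the `nvidia-ml-py3` package added to requirements.txt; `LOCAL_RANK` follows the `torch.distributed.launch` convention):

import os

from utils.gpu_affinity import set_affinity

os.environ.setdefault('LOCAL_RANK', '0')  # normally set by the launcher
cores = set_affinity()
print(f'worker for GPU {os.environ["LOCAL_RANK"]} pinned to cores {sorted(cores)}')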

+ 15 - 1
PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/proj_adaptive_softmax.py

@@ -17,6 +17,20 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 
+class OptionalParameterList(nn.ParameterList):
+    def extra_repr(self):
+        child_lines = []
+        for k, p in self._parameters.items():
+            if p is not None:
+                size_str = 'x'.join(str(size) for size in p.size())
+                device_str = '' if not p.is_cuda else ' (GPU {})'.format(p.get_device())
+                parastr = 'Parameter containing: [{} of size {}{}]'.format(
+                    torch.typename(p), size_str, device_str)
+                child_lines.append('  (' + str(k) + '): ' + parastr)
+        return '\n'.join(child_lines)
+
+
 class ProjectedAdaptiveLogSoftmax(nn.Module):
     def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
                  tie_projs=None, out_layers_weights=None, out_projs=None,
@@ -49,7 +63,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
         self.out_layers_biases = nn.ParameterList()
 
         self.shared_out_projs = out_projs
-        self.out_projs = nn.ParameterList()
+        self.out_projs = OptionalParameterList()
 
         if div_val == 1:
             if d_proj != d_embed:

+ 64 - 2
PyTorch/LanguageModeling/Transformer-XL/pytorch/wt103_base.yaml

@@ -45,7 +45,7 @@ default:
       <<: *eval
 
 
-# Full training configs for DGX-1 (8x V100 16G)
+# Full training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
 dgx1_8gpu_fp16: &dgx1_8gpu_fp16
    train:
       <<: *train
@@ -109,7 +109,7 @@ dgx1_1gpu_fp32: &dgx1_1gpu_fp32
    eval:
       <<: *eval
 
-# Full training configs for DGX-2 (16x V100 32G)
+# Full training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
 dgx2_16gpu_fp16: &dgx2_16gpu_fp16
    train:
       <<: *train
@@ -188,6 +188,68 @@ dgx2_1gpu_fp32: &dgx2_1gpu_fp32
    eval:
       <<: *eval
 
+# Full training configs for NVIDIA DGX A100 (8x NVIDIA A100 40GB GPU)
+dgxa100_8gpu_fp16: &dgxa100_8gpu_fp16
+   train:
+      <<: *train
+      fp16: true
+   eval:
+      <<: *eval
+      fp16: true
+
+dgxa100_8gpu_tf32: &dgxa100_8gpu_tf32
+   train:
+      <<: *train
+   eval:
+      <<: *eval
+
+dgxa100_4gpu_fp16: &dgxa100_4gpu_fp16
+   train:
+      <<: *train
+      fp16: true
+      batch_chunk: 2
+   eval:
+      <<: *eval
+      fp16: true
+
+dgxa100_4gpu_tf32: &dgxa100_4gpu_tf32
+   train:
+      <<: *train
+      batch_chunk: 2
+   eval:
+      <<: *eval
+
+dgxa100_2gpu_fp16: &dgxa100_2gpu_fp16
+   train:
+      <<: *train
+      fp16: true
+      batch_chunk: 4
+   eval:
+      <<: *eval
+      fp16: true
+
+dgxa100_2gpu_tf32: &dgxa100_2gpu_tf32
+   train:
+      <<: *train
+      batch_chunk: 4
+   eval:
+      <<: *eval
+
+dgxa100_1gpu_fp16: &dgxa100_1gpu_fp16
+   train:
+      <<: *train
+      fp16: true
+      batch_chunk: 8
+   eval:
+      <<: *eval
+      fp16: true
+
+dgxa100_1gpu_tf32: &dgxa100_1gpu_tf32
+   train:
+      <<: *train
+      batch_chunk: 8
+   eval:
+      <<: *eval
 
 # Training benchmarks
 trainbench: &trainbench

+ 67 - 4
PyTorch/LanguageModeling/Transformer-XL/pytorch/wt103_large.yaml

@@ -56,7 +56,7 @@ default:
       <<: *eval
 
 
-# Full training configs for DGX-1 (8x V100 16G)
+# Full training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
 dgx1_8gpu_fp16: &dgx1_8gpu_fp16
    train:
       <<: *train
@@ -122,7 +122,7 @@ dgx1_1gpu_fp32: &dgx1_1gpu_fp32
       <<: *eval
 
 
-# Full training configs for DGX-2 (16x V100 32G)
+# Full training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
 dgx2_16gpu_fp16: &dgx2_16gpu_fp16
    train:
       <<: *train
@@ -201,7 +201,71 @@ dgx2_1gpu_fp32: &dgx2_1gpu_fp32
    eval:
       <<: *eval
 
-# Full training configs for multi-node DGX-2 (16x V100 32G)
+# Full training configs for NVIDIA DGX A100 (8x NVIDIA A100 40GB GPU)
+dgxa100_8gpu_fp16: &dgxa100_8gpu_fp16
+   train:
+      <<: *train
+      fp16: true
+   eval:
+      <<: *eval
+      fp16: true
+
+dgxa100_8gpu_tf32: &dgxa100_8gpu_tf32
+   train:
+      <<: *train
+      batch_chunk: 2
+   eval:
+      <<: *eval
+
+dgxa100_4gpu_fp16: &dgxa100_4gpu_fp16
+   train:
+      <<: *train
+      fp16: true
+      batch_chunk: 2
+   eval:
+      <<: *eval
+      fp16: true
+
+dgxa100_4gpu_tf32: &dgxa100_4gpu_tf32
+   train:
+      <<: *train
+      batch_chunk: 4
+   eval:
+      <<: *eval
+
+dgxa100_2gpu_fp16: &dgxa100_2gpu_fp16
+   train:
+      <<: *train
+      fp16: true
+      batch_chunk: 4
+   eval:
+      <<: *eval
+      fp16: true
+
+dgxa100_2gpu_tf32: &dgxa100_2gpu_tf32
+   train:
+      <<: *train
+      batch_chunk: 8
+   eval:
+      <<: *eval
+
+dgxa100_1gpu_fp16: &dgxa100_1gpu_fp16
+   train:
+      <<: *train
+      fp16: true
+      batch_chunk: 8
+   eval:
+      <<: *eval
+      fp16: true
+
+dgxa100_1gpu_tf32: &dgxa100_1gpu_tf32
+   train:
+      <<: *train
+      batch_chunk: 16
+   eval:
+      <<: *eval
+
+# Full training configs for multi-node NVIDIA DGX-2 (16x NVIDIA V100 32GB GPU)
 8dgx2_16gpu_fp16: &8dgx2_16gpu_fp16
    train:
       <<: *train_multinode
@@ -269,7 +333,6 @@ dgx2_1gpu_fp32: &dgx2_1gpu_fp32
       <<: *eval
       batch_size: 16
 
-
 # Training benchmarks
 trainbench: &trainbench
    train:

+ 1 - 0
PyTorch/LanguageModeling/Transformer-XL/pytorch/requirements.txt → PyTorch/LanguageModeling/Transformer-XL/requirements.txt

@@ -1,2 +1,3 @@
 pytorch-transformers==1.1.0
 sacremoses==0.0.35
+nvidia-ml-py3==7.352.0

Some files were not shown because too many files changed in this diff