@@ -30,11 +30,10 @@
 import torch.jit
 from apex.optimizers import FusedAdam
-import logging
 import os
-import sys
 import math
 import time
+import numpy as np
 from argparse import ArgumentParser

 import torch
@@ -44,15 +43,11 @@ import utils
 import dataloading
 from neumf import NeuMF

-from logger.logger import LOGGER, timed_block, timed_function
-from logger import tags
-from logger.autologging import log_hardware, log_args
+import dllogger

 from apex.parallel import DistributedDataParallel as DDP
 from apex import amp

-LOGGER.model = 'ncf'
-

 def parse_args():
     parser = ArgumentParser(description="Train a Neural Collaborative"
                             " Filtering model")
@@ -98,36 +93,29 @@ def parse_args():
     parser.add_argument('--opt_level', default='O2', type=str,
                         help='Optimization level for Automatic Mixed Precision',
                         choices=['O0', 'O2'])
-    parser.add_argument('--local_rank', default=0, type=int, help='Necessary for multi-GPU training')
+    parser.add_argument('--log_path', default='log.json', type=str,
+                        help='Path for the JSON training log')
     return parser.parse_args()


-def init_distributed(local_rank=0):
-    distributed = int(os.environ['WORLD_SIZE']) > 1
+def init_distributed(args):
+    args.world_size = int(os.environ['WORLD_SIZE'])
+    args.distributed = args.world_size > 1
+
+    if args.distributed:
+        args.local_rank = int(os.environ['LOCAL_RANK'])
-    if distributed:
         '''
         Set cuda device so everything is done on the right GPU.
         THIS MUST BE DONE AS SOON AS POSSIBLE.
         '''
-        torch.cuda.set_device(local_rank)
+        torch.cuda.set_device(args.local_rank)

         '''Initialize distributed communication'''
         torch.distributed.init_process_group(backend='nccl',
                                              init_method='env://')
-        logging_logger = logging.getLogger('mlperf_compliance')
-        if local_rank > 0:
-            sys.stdout = open('/dev/null', 'w')
-            sys.stderr = open('/dev/null', 'w')
-            logging_logger.setLevel(logging.ERROR)
-
-        logging_nvlogger = logging.getLogger('nv_dl_logger')
-        if local_rank > 0:
-            sys.stdout = open('/dev/null', 'w')
-            sys.stderr = open('/dev/null', 'w')
-            logging_nvlogger.setLevel(logging.ERROR)
-
-    return distributed, int(os.environ['WORLD_SIZE'])
+    else:
+        args.local_rank = 0


 def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user,
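Note on this hunk: init_distributed() now reads WORLD_SIZE unconditionally and LOCAL_RANK only in the distributed branch, and the --local_rank flag is gone from the parser. The script therefore assumes a launcher that exports both variables, e.g. `torchrun --nproc_per_node=8 ncf.py ...` (script name assumed; the older torch.distributed.launch needs --use_env to export LOCAL_RANK). A single-process run still needs WORLD_SIZE=1 in the environment, or the os.environ lookup raises KeyError. A more defensive variant, sketched here as a hypothetical alternative rather than as part of this change:

    # Hypothetical fallback: default to a single-process setup when no
    # launcher exported the variables.
    args.world_size = int(os.environ.get('WORLD_SIZE', 1))
    args.distributed = args.world_size > 1
    args.local_rank = int(os.environ.get('LOCAL_RANK', 0)) if args.distributed else 0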
@@ -152,10 +140,6 @@ def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user
     hits = ifzero.sum()
     ndcg = (math.log(2) / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()

-    LOGGER.log(key=tags.EVAL_SIZE, value={"epoch": epoch, "value": num_user * samples_per_user})
-    LOGGER.log(key=tags.EVAL_HP_NUM_USERS, value=num_user)
-    LOGGER.log(key=tags.EVAL_HP_NUM_NEG, value=samples_per_user - 1)
-
     if distributed:
         torch.distributed.all_reduce(hits, op=torch.distributed.reduce_op.SUM)
         torch.distributed.all_reduce(ndcg, op=torch.distributed.reduce_op.SUM)
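Note on this hunk: the untouched context lines still pass op=torch.distributed.reduce_op.SUM to all_reduce; newer PyTorch releases deprecate reduce_op in favour of torch.distributed.ReduceOp.SUM, so that spelling may warrant a follow-up if the base PyTorch version moves forward.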
@@ -168,10 +152,17 @@ def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user


 def main():
-    log_hardware()
     args = parse_args()
-    args.distributed, args.world_size = init_distributed(args.local_rank)
-    log_args(args)
+    init_distributed(args)
+
+    if args.local_rank == 0:
+        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
+                                                           filename=args.log_path),
+                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
+    else:
+        dllogger.init(backends=[])
+
+    dllogger.log(data=vars(args), step='PARAMETER')

     if args.seed is not None:
         torch.manual_seed(args.seed)
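Note on this hunk: dllogger is initialised with backends only on rank 0, so a multi-GPU run writes exactly one JSON log and one stdout stream; dllogger.init(backends=[]) on the remaining ranks turns their dllogger.log() calls into no-ops. For reading the resulting file back, a minimal sketch, assuming each line holds one entry as '<marker> <json>' (current dllogger versions prefix the JSON payload with a short marker):

    import json

    def load_log(path='log.json'):
        # Split off the marker token, then parse the JSON payload of each line.
        entries = []
        with open(path) as f:
            for line in f:
                _, _, payload = line.partition(' ')
                entries.append(json.loads(payload))
        return entries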
@@ -180,31 +171,22 @@ def main():
     if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
         os.makedirs(args.checkpoint_dir, exist_ok=True)

-    # The default of np.random.choice is replace=True, so does pytorch random_()
-    LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
-    LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
-    LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)
-
     # sync workers before timing
     if args.distributed:
         torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
     torch.cuda.synchronize()

     main_start_time = time.time()
-    LOGGER.log(key=tags.RUN_START)

     train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
     test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
     test_negs = torch.load(args.data+'/test_negatives.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))

     valid_negative = test_negs.shape[1]
-    LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=valid_negative)
-

     nb_maxs = torch.max(train_ratings, 0)[0]
     nb_users = nb_maxs[0].item() + 1
     nb_items = nb_maxs[1].item() + 1
-    LOGGER.log(key=tags.INPUT_SIZE, value=len(train_ratings))

     all_test_users = test_ratings.shape[0]
@@ -213,9 +195,6 @@ def main():
     # make pytorch memory behavior more consistent later
     torch.cuda.empty_cache()

-    LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
-    LOGGER.log(key=tags.INPUT_ORDER)  # we shuffled later with randperm
-
     # Create model
     model = NeuMF(nb_users, nb_items,
                   mf_dim=args.factors,
@@ -243,12 +222,6 @@ def main():

     print(model)
     print("{} parameters".format(utils.count_parameters(model)))
-    LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
-    LOGGER.log(key=tags.OPT_NAME, value="Adam")
-    LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
-    LOGGER.log(key=tags.OPT_HP_ADAM_BETA2, value=args.beta2)
-    LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
-    LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)

     if args.load_checkpoint_path:
         state_dict = torch.load(args.load_checkpoint_path)
@@ -256,33 +229,24 @@ def main():
         model.load_state_dict(state_dict)

     if args.mode == 'test':
-        LOGGER.log(key=tags.EVAL_START, value=0)
         start = time.time()
         hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
                              samples_per_user=valid_negative + 1,
                              num_user=all_test_users, distributed=args.distributed)
-        print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'
-              .format(K=args.topk, hit_rate=hr, ndcg=ndcg))
         val_time = time.time() - start
         eval_size = all_test_users * (valid_negative + 1)
         eval_throughput = eval_size / val_time

-        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": 0, "value": hr})
-        LOGGER.log(key=tags.EVAL_STOP, value=0)
-        LOGGER.log(key='best_eval_throughput', value=eval_throughput)
+        dllogger.log(step=tuple(), data={'best_eval_throughput' : eval_throughput,
+                                         'hr@10' : hr})
         return

-    success = False
     max_hr = 0
+    best_epoch = 0
     train_throughputs, eval_throughputs = [], []

-    LOGGER.log(key=tags.TRAIN_LOOP)
     for epoch in range(args.epochs):

-        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
-        LOGGER.log(key=tags.INPUT_HP_NUM_NEG, value=args.negative_samples)
-        LOGGER.log(key=tags.INPUT_STEP_TRAIN_NEG_GEN)
-
         begin = time.time()

         epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(train_ratings, nb_items, args)
@@ -315,32 +279,28 @@ def main():
         epoch_samples = len(train_ratings) * (args.negative_samples + 1)
         train_throughput = epoch_samples / train_time
         train_throughputs.append(train_throughput)
-        LOGGER.log(key='train_throughput', value=train_throughput)
-        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
-        LOGGER.log(key=tags.EVAL_START, value=epoch)

         hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
                              samples_per_user=valid_negative + 1,
                              num_user=all_test_users, epoch=epoch, distributed=args.distributed)

         val_time = time.time() - begin
-        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
-              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'
-              .format(epoch=epoch, K=args.topk, hit_rate=hr,
-                      ndcg=ndcg, train_time=train_time,
-                      val_time=val_time))

-        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
-        LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
-        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

         eval_size = all_test_users * (valid_negative + 1)
         eval_throughput = eval_size / val_time
         eval_throughputs.append(eval_throughput)
-        LOGGER.log(key='eval_throughput', value=eval_throughput)
+
+        dllogger.log(step=(epoch,),
+                     data = {'train_throughput': train_throughput,
+                             'hr@10': hr,
+                             'train_epoch_time': train_time,
+                             'validation_epoch_time': val_time,
+                             'eval_throughput': eval_throughput})

         if hr > max_hr and args.local_rank == 0:
             max_hr = hr
+            best_epoch = epoch
             save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
             print("New best hr! Saving the model to: ", save_checkpoint_path)
             torch.save(model.state_dict(), save_checkpoint_path)
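Note on this hunk: the step argument distinguishes the three kinds of entries this change emits: step='PARAMETER' for the run configuration, step=(epoch,) for per-epoch metrics, and step=tuple() for run-level summaries. The tuples are serialised as JSON lists, so a reader can split the log accordingly; a hypothetical helper building on the load_log() sketch above:

    def split_log(entries):
        # Per-epoch metrics carry a one-element step list; run-level
        # summaries carry an empty one ('PARAMETER' entries keep the string).
        per_epoch = [e for e in entries
                     if isinstance(e.get('step'), list) and len(e['step']) == 1]
        summary = [e for e in entries if e.get('step') == []]
        return per_epoch, summary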
@@ -349,18 +309,19 @@ def main():
         if args.threshold is not None:
             if hr >= args.threshold:
                 print("Hit threshold of {}".format(args.threshold))
-                success = True
                 break

     if args.local_rank == 0:
-        LOGGER.log(key='best_train_throughput', value=max(train_throughputs))
-        LOGGER.log(key='best_eval_throughput', value=max(eval_throughputs))
-        LOGGER.log(key='best_accuracy', value=max_hr)
-        LOGGER.log(key='time_to_target', value=time.time() - main_start_time)
-        LOGGER.log(key='time_to_best_model', value=best_model_timestamp - main_start_time)
-
-        LOGGER.log(key=tags.RUN_STOP, value={"success": success})
-        LOGGER.log(key=tags.RUN_FINAL)
+        dllogger.log(data={'best_train_throughput': max(train_throughputs),
+                           'best_eval_throughput': max(eval_throughputs),
+                           'mean_train_throughput': np.mean(train_throughputs),
+                           'mean_eval_throughput': np.mean(eval_throughputs),
+                           'best_accuracy': max_hr,
+                           'best_epoch': best_epoch,
+                           'time_to_target': time.time() - main_start_time,
+                           'time_to_best_model': best_model_timestamp - main_start_time},
+                   step=tuple())
+

 if __name__ == '__main__':
     main()