Răsfoiți Sursa

Remove dllogger and fix bugs from GH

Signed-off-by: Pablo Ribalta <[email protected]>
Pablo Ribalta 6 ani în urmă
părinte
comite
70f3e4362c
46 a modificat fișierele cu 957 adăugiri și 1345 ștergeri
  1. 7 3
      TensorFlow/Segmentation/UNet_Industrial/Dockerfile
  2. 17 23
      TensorFlow/Segmentation/UNet_Industrial/README.md
  3. 0 2
      TensorFlow/Segmentation/UNet_Industrial/datasets/core.py
  4. 43 31
      TensorFlow/Segmentation/UNet_Industrial/datasets/dagm2007.py
  5. 0 22
      TensorFlow/Segmentation/UNet_Industrial/dllogger/README.md
  6. 0 19
      TensorFlow/Segmentation/UNet_Industrial/dllogger/dllogger/__init__.py
  7. 0 60
      TensorFlow/Segmentation/UNet_Industrial/dllogger/dllogger/autologging.py
  8. 0 531
      TensorFlow/Segmentation/UNet_Industrial/dllogger/dllogger/logger.py
  9. 0 255
      TensorFlow/Segmentation/UNet_Industrial/dllogger/dllogger/tags.py
  10. 0 151
      TensorFlow/Segmentation/UNet_Industrial/dllogger/dummy_run.py
  11. 0 37
      TensorFlow/Segmentation/UNet_Industrial/dllogger/setup.py
  12. 12 17
      TensorFlow/Segmentation/UNet_Industrial/main.py
  13. 2 4
      TensorFlow/Segmentation/UNet_Industrial/model/layers/utils.py
  14. 9 3
      TensorFlow/Segmentation/UNet_Industrial/model/unet.py
  15. 1 0
      TensorFlow/Segmentation/UNet_Industrial/requirements.txt
  16. 94 80
      TensorFlow/Segmentation/UNet_Industrial/runtime/runner.py
  17. 3 3
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_1GPU.sh
  18. 50 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_1GPU_XLA.sh
  19. 3 3
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_4GPU.sh
  20. 60 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_4GPU_XLA.sh
  21. 3 3
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_8GPU.sh
  22. 60 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_8GPU_XLA.sh
  23. 3 3
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_EVAL.sh
  24. 50 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_EVAL_XLA.sh
  25. 2 2
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_1GPU.sh
  26. 50 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_1GPU_XLA.sh
  27. 2 2
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_4GPU.sh
  28. 60 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_4GPU_XLA.sh
  29. 2 2
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_8GPU.sh
  30. 60 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_8GPU_XLA.sh
  31. 2 2
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_EVAL.sh
  32. 50 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_EVAL_XLA.sh
  33. 11 7
      TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_evalbench_AMP.sh
  34. 11 7
      TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_evalbench_FP32.sh
  35. 11 7
      TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_AMP_1GPU.sh
  36. 11 7
      TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_AMP_4GPU.sh
  37. 11 7
      TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_AMP_8GPU.sh
  38. 11 7
      TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_FP32_1GPU.sh
  39. 11 7
      TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_FP32_4GPU.sh
  40. 11 7
      TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_FP32_8GPU.sh
  41. 24 0
      TensorFlow/Segmentation/UNet_Industrial/scripts/launch_docker.sh
  42. 1 0
      TensorFlow/Segmentation/UNet_Industrial/utils/__init__.py
  43. 11 3
      TensorFlow/Segmentation/UNet_Industrial/utils/cmdline_helper.py
  44. 137 22
      TensorFlow/Segmentation/UNet_Industrial/utils/hooks/profiler_hook.py
  45. 1 6
      TensorFlow/Segmentation/UNet_Industrial/utils/hvd_utils.py
  46. 50 0
      TensorFlow/Segmentation/UNet_Industrial/utils/logging.py

+ 7 - 3
TensorFlow/Segmentation/UNet_Industrial/Dockerfile

@@ -16,11 +16,15 @@
 #
 # ==============================================================================
 
-FROM nvcr.io/nvidia/tensorflow:19.05-py3
+FROM nvcr.io/nvidia/tensorflow:20.01-tf1-py3
 
 LABEL version="1.0" maintainer="Jonathan DEKHTIAR <[email protected]>"
 
+WORKDIR /opt
+COPY requirements.txt /opt/requirements_unet_tf_industrial.txt
+
+RUN python -m pip --no-cache-dir --no-cache install --upgrade pip && \
+    pip --no-cache-dir --no-cache install -r /opt/requirements_unet_tf_industrial.txt
+
 ADD . /workspace/unet_industrial
 WORKDIR /workspace/unet_industrial
-
-RUN pip install dllogger/

+ 17 - 23
TensorFlow/Segmentation/UNet_Industrial/README.md

@@ -138,7 +138,7 @@ Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
 
-* [TensorFlow 19.03-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
+* [TensorFlow 19.12-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
 * (optional) NVIDIA Volta GPU (see section below) - for best training performance using mixed precision
 
 For more information about how to get started with NGC containers, see the
@@ -219,11 +219,6 @@ cd scripts/
 ./UNet_FP32_EVAL.sh <path to result repository> <path to dataset> <DAGM2007 classID (1-10)>
 ```
 
-If you wish to evaluate external checkpoint, make sure to put the TF ckpt files inside a folder named "checkpoints"
-and provide its parent path as `<path to result repository>` in the example above. 
-Be aware that the script will not fail if it does not find the checkpoint. 
-It will randomly initialize the weights and run performance tests.
-
 ## Advanced
 
 The following sections provide greater details of the dataset, running training and inference, and the training results.
@@ -374,7 +369,7 @@ The following sections provide details on the achieved results in training accur
 #### Training accuracy results
 
 Our results were obtained by running the `./scripts/UNet_{FP32, AMP}_{1, 4, 8}GPU.sh` training
-script in the Tensorflow:19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
+script in the Tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
 
 ##### Threshold = 0.75
 
@@ -481,30 +476,29 @@ script in the Tensorflow:19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 16
 <!-- Spreedsheet to Markdown: https://thisdavej.com/copy-table-in-excel-and-paste-as-a-markdown-table/ -->
 
 Our results were obtained by running the scripts
-`./scripts/benchmarking/DGX1v_trainbench_{FP16, FP32, FP32AMP, FP32FM}_{1, 4, 8}GPU.sh` training script in the
-TensorFlow 19.03-py3 NGC container on an NVIDIA DGX-1 with 8 V100 16G GPUs.
-
-
-| # GPUs | Precision                       | Throughput (Imgs/sec) | Training Time | Speedup |
-|--------|---------------------------------|-----------------------|---------------|---------|
-| 1      | FP32                            | 89                    | 7m44          | 1.00    |
-| 1      | Automatic Mixed Precision (AMP) | 104                   | 6m40          | 1.17    |
-| 4      | FP32                            | 261                   | 2m48          | 1.00    |
-| 4      | Automatic Mixed Precision (AMP) | 302                   | 2m27          | 1.16    |
-| 8      | FP32                            | 445                   | 1m44          | 1.00    |
-| 8      | Automatic Mixed Precision (AMP) | 491                   | 1m36          | 1.10    |
+`./scripts/benchmarking/DGX1v_trainbench_{FP32, AMP}_{1, 4, 8}GPU.sh` training script in the
+TensorFlow `19.12-tf1-py3` NGC container on an NVIDIA DGX-1 with 8 V100 16G GPUs.
+
+| # GPUs | Precision                       | Throughput (Imgs/sec) | AMP Speedup | Scaling efficiency |
+|--------|---------------------------------|-----------------------|-------------|--------------------|
+| 1      | FP32                            | 92                    | 1.00        | 1.00               |
+| 1      | Automatic Mixed Precision (AMP) | 167                   | 1.82        | 1.00               |
+| 4      | FP32                            | 299                   | 1.00        | 3.25               |
+| 4      | Automatic Mixed Precision (AMP) | 458                   | 1.53        | 2.74               |
+| 8      | FP32                            | 507                   | 1.00        | 5.51               |
+| 8      | Automatic Mixed Precision (AMP) | 561                   | 1.11        | 3.36               |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
 #### Inference performance results
 
-Our results were obtained by running the aforementioned scripts in the TensorFlow 
-19.03-py3 NGC container on an NVIDIA DGX-1 server with 8 V100 16G GPUs.
+Our results were obtained by running the scripts `./scripts/benchmarking/DGX1v_evalbench_{FP32, AMP}.sh`
+evaluation script in the `19.12-tf1-py3` NGC container on an NVIDIA DGX-1 server with 8 V100 16G GPUs.
 
 | # GPUs | Precision                       | Throughput (Imgs/sec) | Speedup |
 |--------|---------------------------------|-----------------------|---------|
-| 1      | FP32                            | 228                   | 1.00    |
-| 1      | Automatic Mixed Precision (AMP) | 301                   | 1.32    |
+| 1      | FP32                            | 306                   | 1.00    |
+| 1      | Automatic Mixed Precision (AMP) | 550                   | 1.80    |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 

+ 0 - 2
TensorFlow/Segmentation/UNet_Industrial/datasets/core.py

@@ -19,8 +19,6 @@
 #
 # ==============================================================================
 
-from __future__ import print_function
-
 import os
 from abc import ABC, abstractmethod
 

+ 43 - 31
TensorFlow/Segmentation/UNet_Industrial/datasets/dagm2007.py

@@ -37,7 +37,7 @@ from datasets.core import BaseDataset
 
 from utils import hvd_utils
 
-from dllogger.logger import LOGGER
+from dllogger import Logger
 
 __all__ = ['DAGM2007_Dataset']
 
@@ -109,7 +109,21 @@ class DAGM2007_Dataset(BaseDataset):
 
         shuffle_buffer_size = 10000
 
-        def decode_csv(line):
+        image_dir, csv_file = self._get_data_dirs(training=training)
+
+        mask_image_dir = os.path.join(image_dir, "Label")
+
+        dataset = tf.data.TextLineDataset(csv_file)
+
+        dataset = dataset.skip(1)  # Skip CSV Header
+
+        if only_defective_images:
+            dataset = dataset.filter(lambda line: tf.not_equal(tf.strings.substr(line, -1, 1), "0"))
+
+        if hvd_utils.is_using_hvd() and training:
+            dataset = dataset.shard(hvd.size(), hvd.rank())
+
+        def _load_dagm_data(line):
 
             input_image_name, image_mask_name, label = tf.decode_csv(
                 line, record_defaults=[[""], [""], [0]], field_delim=','
@@ -156,10 +170,33 @@ class DAGM2007_Dataset(BaseDataset):
                 ),
             )
 
+            label = tf.cast(label, tf.int32)
+
+            return tf.data.Dataset.from_tensor_slices(([input_image], [mask_image], [label]))
+
+        dataset = dataset.apply(
+            tf.data.experimental.parallel_interleave(
+                _load_dagm_data,
+                cycle_length=batch_size*8,
+                block_length=4,
+                buffer_output_elements=batch_size*8
+            )
+        )
+
+        dataset = dataset.cache()
+
+        if training:
+            dataset = dataset.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=shuffle_buffer_size, seed=seed))
+
+        else:
+            dataset = dataset.repeat()
+
+        def _augment_data(input_image, mask_image, label):
+
             if augment_data:
 
-                if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-                    LOGGER.log("Using data augmentation ...")
+                if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+                    print("Using data augmentation ...")
 
                 #input_image = tf.image.per_image_standardization(input_image)
 
@@ -173,36 +210,11 @@ class DAGM2007_Dataset(BaseDataset):
                 input_image = tf.image.rot90(input_image, k=n_rots)
                 mask_image = tf.image.rot90(mask_image, k=n_rots)
 
-            label = tf.cast(label, tf.int32)
-
             return (input_image, mask_image), label
 
-        image_dir, csv_file = self._get_data_dirs(training=training)
-
-        mask_image_dir = os.path.join(image_dir, "Label")
-
-        dataset = tf.data.TextLineDataset(csv_file)
-
-        dataset = dataset.skip(1)  # Skip CSV Header
-
-        if only_defective_images:
-            dataset = dataset.filter(lambda line: tf.not_equal(tf.strings.substr(line, -1, 1), "0"))
-
-        dataset = dataset.cache()
-
-        if training:
-
-            dataset = dataset.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=shuffle_buffer_size, seed=seed))
-
-            if hvd_utils.is_using_hvd():
-                dataset = dataset.shard(hvd.size(), hvd.rank())
-
-        else:
-            dataset = dataset.repeat()
-
         dataset = dataset.apply(
             tf.data.experimental.map_and_batch(
-                map_func=decode_csv,
+                map_func=_augment_data,
                 num_parallel_calls=num_threads,
                 batch_size=batch_size,
                 drop_remainder=True,
@@ -212,7 +224,7 @@ class DAGM2007_Dataset(BaseDataset):
         dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)
 
         if use_gpu_prefetch:
-            dataset.apply(tf.data.experimental.prefetch_to_device(device="/gpu:0", buffer_size=batch_size * 8))
+            dataset.apply(tf.data.experimental.prefetch_to_device(device="/gpu:0", buffer_size=4))
 
         return dataset
 

+ 0 - 22
TensorFlow/Segmentation/UNet_Industrial/dllogger/README.md

@@ -1,22 +0,0 @@
-# Tools for logging DL training
-DLLogger is a tool to generate logs during Deep Learning training.
-
-## Installation
-```
-git clone https://gitlab-master.nvidia.com/dl/JoC/DLLogger.git
-pip install DLLogger/.
-```
-
-## Usage
-You can use DLLogger with the simplest `LOGGER.log()` API:
-```
-from logger.logger import LOGGER
-from logger import tags
-
-LOGGER.model = 'ResNet'
-LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=128)
-```
-For the more advanced usage, please refer to the `dummy_run.py` example.
-
-## Tags
-All available tags are listed in the `logger/tags.py` file.

+ 0 - 19
TensorFlow/Segmentation/UNet_Industrial/dllogger/dllogger/__init__.py

@@ -1,19 +0,0 @@
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from .logger import LOGGER, StdOutBackend, MLPerfBackend, JsonBackend, CompactBackend, Scope, AverageMeter, StandardMeter
-from . import tags
-
-__all__ = [LOGGER, StdOutBackend, MLPerfBackend, JsonBackend, CompactBackend, Scope, AverageMeter, StandardMeter, tags]

+ 0 - 60
TensorFlow/Segmentation/UNet_Industrial/dllogger/dllogger/autologging.py

@@ -1,60 +0,0 @@
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Common values reported
-
-
-import subprocess
-import xml.etree.ElementTree as ET
-
-#TODO: print CUDA version, container version etc
-
-def log_hardware(logger):
-    # TODO: asserts - what if you cannot launch those commands?
-    # number of CPU threads
-    cpu_info_command = 'cat /proc/cpuinfo'
-    cpu_info = subprocess.run(cpu_info_command.split(), stdout=subprocess.PIPE).stdout.split()
-    cpu_num_index = len(cpu_info) - cpu_info[::-1].index(b'processor') + 1
-    cpu_num = int(cpu_info[cpu_num_index]) + 1
-
-    # CPU name
-    cpu_name_begin_index = cpu_info.index(b'name')
-    cpu_name_end_index = cpu_info.index(b'stepping')
-    cpu_name = b' '.join(cpu_info[cpu_name_begin_index + 2:cpu_name_end_index]).decode('utf-8')
-
-    logger.log(key='cpu_info', value={"num": cpu_num, "name": cpu_name})
-
-    # RAM memory
-    ram_info_command = 'free -m -h'
-    ram_info = subprocess.run(ram_info_command.split(), stdout=subprocess.PIPE).stdout.split()
-    ram_index = ram_info.index(b'Mem:') + 1
-    ram = ram_info[ram_index].decode('utf-8')
-
-    logger.log(key='mem_info', value={"ram": ram})
-
-    # GPU
-    nvidia_smi_command = 'nvidia-smi -q -x'
-    nvidia_smi_output = subprocess.run(nvidia_smi_command.split(), stdout=subprocess.PIPE).stdout
-    nvidia_smi = ET.fromstring(nvidia_smi_output)
-    gpus = nvidia_smi.findall('gpu')
-    ver = nvidia_smi.findall('driver_version')
-
-    logger.log(key="gpu_info",
-                 value={
-                      "driver_version": ver[0].text,
-                      "num": len(gpus),
-                      "name": [g.find('product_name').text for g in gpus],
-                      "mem": [g.find('fb_memory_usage').find('total').text for g in gpus]})
-
-def log_args(logger, args):
-    logger.log(key='args', value=vars(args))

+ 0 - 531
TensorFlow/Segmentation/UNet_Industrial/dllogger/dllogger/logger.py

@@ -1,531 +0,0 @@
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import time
-import json
-import logging
-import inspect
-import sys
-from contextlib import contextmanager
-import functools
-from collections import OrderedDict
-import datetime
-
-from . import autologging
-
-NVLOGGER_NAME = 'nv_dl_logger'
-NVLOGGER_VERSION = '0.3.1'
-NVLOGGER_TOKEN = ':::NVLOG'
-
-MLPERF_NAME = 'mlperf_logger'
-MLPERF_VERSION = '0.5.0'
-MLPERF_TOKEN = ':::MLP'
-
-COMPACT_NAME = 'compact_logger'
-
-DEFAULT_JSON_FILENAME = 'nvlog.json'
-
-class Scope:
-    RUN = 0
-    EPOCH = 1
-    TRAIN_ITER = 2
-
-
-class Level:
-    CRITICAL = 5
-    ERROR = 4
-    WARNING = 3
-    INFO = 2
-    DEBUG = 1
-
-
-_data = OrderedDict([
-    ('model', None),
-    ('epoch', -1),
-    ('iteration', -1),
-    ('total_iteration', -1),
-    ('metrics', OrderedDict()),
-    ('timed_blocks', OrderedDict()),
-    ('current_scope', Scope.RUN)
-    ])
-
-def get_caller(root_dir=None):
-    stack_files = [s.filename.split('/')[-1] for s in inspect.stack()]
-    stack_index = 0
-    while stack_index < len(stack_files) and stack_files[stack_index] != 'logger.py':
-        stack_index += 1
-
-    while (stack_index < len(stack_files) and 
-            stack_files[stack_index] in ['logger.py', 'autologging.py', 'contextlib.py']):
-        stack_index += 1
-
-    while True:
-        try:
-            caller_line = inspect.stack()[stack_index].lineno
-            caller_file = stack_files[stack_index]
-            break
-        except IndexError:
-            stack_index -= 1
-
-        if stack_index < 0:
-            caller_line = 0
-            caller_file = "Unknown Calling File"
-            break
-
-    return "%s:%d" % (caller_file, caller_line)
-
-class StandardMeter(object):
-
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.value = None
-
-    def record(self, value):
-        self.value = value
-
-    def get_value(self):
-        return self.value
-
-    def get_last(self):
-        return self.value
-
-class AverageMeter(object):
-
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.count = 0
-        self.value = 0
-        self.last = 0
-
-    def record(self, value, n = 1):
-        self.last = value
-        self.count += n
-        self.value += value * n
-
-    def get_value(self):
-        return self.value / self.count
-
-    def get_last(self):
-        return self.last
-
-class JsonBackend(object):
-
-    def __init__(self, log_file=DEFAULT_JSON_FILENAME, logging_scope=Scope.TRAIN_ITER,
-            iteration_interval=1):
-        self.log_file = log_file
-        self.logging_scope = logging_scope
-        self.iteration_interval = iteration_interval
-
-        self.json_log = OrderedDict([
-            ('run', OrderedDict()),
-            ('epoch', OrderedDict()),
-            ('iter', OrderedDict()),
-            ('event', OrderedDict()),
-            ])
-        
-        self.json_log['epoch']['x'] = []
-        if self.logging_scope == Scope.TRAIN_ITER:
-            self.json_log['iter']['x'] = [[]]
-
-    def register_metric(self, key, metric_scope):
-        if (metric_scope == Scope.TRAIN_ITER and
-                self.logging_scope == Scope.TRAIN_ITER):
-            if not key in self.json_log['iter'].keys():
-                self.json_log['iter'][key] = [[]]
-        if metric_scope == Scope.EPOCH:
-            if not key in self.json_log['epoch'].keys():
-                self.json_log['epoch'][key] = []
-
-    def log(self, key, value):
-        if _data['current_scope'] == Scope.RUN:
-            self.json_log['run'][key] = value
-        elif _data['current_scope'] == Scope.EPOCH: 
-            pass
-        elif _data['current_scope'] == Scope.TRAIN_ITER:
-            pass
-        else:
-            raise ValueError('log function for scope "', _data['current_scope'], 
-                    '" not implemented')
-
-    def log_event(self, key, value):
-        if not key in self.json_log['event'].keys():
-            self.json_log['event'][key] = []
-        entry = OrderedDict()
-        entry['epoch'] = _data['epoch']
-        entry['iter'] = _data['iteration']
-        entry['timestamp'] = time.time()
-        if value:
-            entry['value'] = value
-        self.json_log['event'][key].append(str(entry))
-
-    def log_iteration_summary(self):
-        if (self.logging_scope == Scope.TRAIN_ITER and 
-                _data['total_iteration'] % self.iteration_interval == 0):
-            for key, m in _data['metrics'].items():
-                if m.metric_scope == Scope.TRAIN_ITER:
-                    self.json_log['iter'][key][-1].append(str(m.get_last()))
-
-            # log x for iteration number
-            self.json_log['iter']['x'][-1].append(_data['iteration'])
-
-
-    def dump_json(self):
-        if self.log_file is None:
-            print(json.dumps(self.json_log, indent=4))
-        else:
-            with open(self.log_file, 'w') as f:
-                json.dump(self.json_log, fp=f, indent=4)
-
-    def log_epoch_summary(self):
-        for key, m in _data['metrics'].items():
-            if m.metric_scope == Scope.EPOCH:
-                self.json_log['epoch'][key].append(str(m.get_value()))
-            elif (m.metric_scope == Scope.TRAIN_ITER and 
-                    self.logging_scope == Scope.TRAIN_ITER):
-                # create new sublists for each iter metric in the next epoch
-                self.json_log['iter'][key].append([])
-        
-        # log x for epoch number
-        self.json_log['epoch']['x'].append(_data['epoch'])
-
-        # create new sublist for iter's x in the next epoch
-        if self.logging_scope == Scope.TRAIN_ITER:
-            self.json_log['iter']['x'].append([])
-
-        self.dump_json()
-
-    def timed_block_start(self, name):
-        pass
-
-    def timed_block_stop(self, name):
-        pass
-
-    def finish(self):
-        self.dump_json()
-
-class _ParentStdOutBackend(object):
-
-    def __init__(self, name, token, version, log_file, logging_scope, iteration_interval):
-
-        self.root_dir = None
-        self.worker = [0]
-        self.prefix = ''
-
-        self.name = name
-        self.token = token
-        self.version = version
-        self.log_file = log_file
-        self.logging_scope = logging_scope
-        self.iteration_interval = iteration_interval
-
-        self.logger = logging.getLogger(self.name)
-        self.logger.setLevel(logging.DEBUG)
-        self.logger.handlers = []
-
-        if (self.log_file is None):
-            self.stream_handler = logging.StreamHandler(stream=sys.stdout)
-            self.stream_handler.setLevel(logging.DEBUG)
-            self.logger.addHandler(self.stream_handler)
-        else:
-            self.file_handler = logging.FileHandler(self.log_file, mode='w')
-            self.file_handler.setLevel(logging.DEBUG)
-            self.logger.addHandler(self.file_handler)
-
-    def register_metric(self, key, meter=None, metric_scope=Scope.EPOCH):
-        pass
-
-    def log_epoch_summary(self):
-        pass
-
-    def log_iteration_summary(self):
-        pass
-
-    def log(self, key, value):
-        if _data['current_scope'] > self.logging_scope:
-            pass
-        elif (_data['current_scope'] == Scope.TRAIN_ITER and 
-                _data['total_iteration'] % self.iteration_interval != 0):
-            pass
-        else:
-            self.log_stdout(key, value)
-
-    def log_event(self, key, value):
-        self.log_stdout(key, value)
-        
-    def log_stdout(self, key, value=None, forced=False):
-        # TODO: worker 0 
-        # only the 0-worker will log
-        #if not forced and self.worker != 0:
-        #    pass
-
-        if value is None:
-            msg = key
-        else:
-            str_json = json.dumps(str(value))
-            msg = '{key}: {value}'.format(key=key, value=str_json)
-
-        call_site = get_caller(root_dir=self.root_dir)
-        now = time.time()
-
-        message = '{prefix}{token}v{ver} {model} {secs:.9f} ({call_site}) {msg}'.format(
-            prefix=self.prefix, token=self.token, ver=self.version, secs=now, 
-            model=_data['model'],
-            call_site=call_site, msg=msg)
-
-        self.logger.debug(message)
-
-    def timed_block_start(self, name):
-        self.log_stdout(key=name + "_start")
-
-    def timed_block_stop(self, name):
-        self.log_stdout(key=name + "_stop")
-
-    def finish(self):
-        pass
-
-class StdOutBackend(_ParentStdOutBackend):
-
-    def __init__(self, log_file=None, logging_scope=Scope.TRAIN_ITER, iteration_interval=1):
-        _ParentStdOutBackend.__init__(self, name=NVLOGGER_NAME, token=NVLOGGER_TOKEN, 
-                version=NVLOGGER_VERSION, log_file=log_file, logging_scope=logging_scope, 
-                iteration_interval=iteration_interval)
-        
-class MLPerfBackend(_ParentStdOutBackend):
-
-    def __init__(self, log_file=None, logging_scope=Scope.TRAIN_ITER, iteration_interval=1):
-        _ParentStdOutBackend.__init__(self, name=MLPERF_NAME, token=MLPERF_TOKEN, 
-                version=MLPERF_VERSION, log_file=log_file, logging_scope=logging_scope, 
-                iteration_interval=iteration_interval)
-
-class CompactBackend(object):
-
-    def __init__(self, log_file=None, logging_scope=Scope.TRAIN_ITER, iteration_interval=1):
-        self.log_file = log_file
-        self.logging_scope = logging_scope
-        self.iteration_interval = iteration_interval
-
-        self.logger = logging.getLogger(COMPACT_NAME)
-        self.logger.setLevel(logging.DEBUG)
-        self.logger.handlers = []
-
-        if (self.log_file is None):
-            self.stream_handler = logging.StreamHandler(stream=sys.stdout)
-            self.stream_handler.setLevel(logging.DEBUG)
-            self.logger.addHandler(self.stream_handler)
-        else:
-            self.file_handler = logging.FileHandler(self.log_file, mode='w')
-            self.file_handler.setLevel(logging.DEBUG)
-            self.logger.addHandler(self.file_handler)
-    
-    def register_metric(self, key, meter=None, metric_scope=Scope.EPOCH):
-        pass
-    
-    def timestamp_prefix(self):
-        return datetime.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
-
-    def log(self, key, value):
-        if _data['current_scope'] == Scope.RUN:
-            self.log_event(key, value)
-    
-    def log_event(self, key, value):
-        msg = self.timestamp_prefix() + ' ' + str(key)
-        if value is not None:
-            msg += ": " + str(value)
-        self.logger.debug(msg)
-    
-    def log_epoch_summary(self):
-        if self.logging_scope >= Scope.EPOCH:
-            summary = self.timestamp_prefix() + ' Epoch {:<4} '.format(str(_data['epoch']) + ':')
-            for key, m in _data['metrics'].items():
-                if m.metric_scope >= Scope.EPOCH:
-                    summary += str(key) + ": " + str(m.get_value()) + ", "
-            self.logger.debug(summary)
-
-    def log_iteration_summary(self):
-        if self.logging_scope >= Scope.TRAIN_ITER and _data['total_iteration'] % self.iteration_interval == 0:
-            summary = self.timestamp_prefix() + ' Iter {:<5} '.format(str(_data['iteration']) + ':')
-            for key, m in _data['metrics'].items():
-                if m.metric_scope == Scope.TRAIN_ITER:
-                    summary += str(key) + ": " + str(m.get_last()) + ", "
-            self.logger.debug(summary)
- 
-    def timed_block_start(self, name):
-        pass
-
-    def timed_block_stop(self, name):
-        pass
-
-    def finish(self):
-        pass
-
-class _Logger(object):
-    def __init__(self):
-
-        self.backends = [
-                CompactBackend(),
-                JsonBackend()
-                ]
-
-        self.level = Level.INFO
-   
-    def set_model_name(self, name):
-        _data['model'] = name
-
-
-    def set_backends(self, backends):
-        self.backends = backends
-        
-    def register_metric(self, key, meter=None, metric_scope=Scope.EPOCH):
-        if meter is None:
-            meter = StandardMeter()
-        #TODO: move to argument of Meter?
-        meter.metric_scope = metric_scope
-        _data['metrics'][key] = meter
-        for b in self.backends:
-            b.register_metric(key, metric_scope)
-
-    def log(self, key, value=None, forced=False, level=Level.INFO):
-        if level < self.level:
-            return
-
-        if _data['current_scope'] == Scope.TRAIN_ITER or _data['current_scope'] == Scope.EPOCH:
-            if key in _data['metrics'].keys():
-                if _data['metrics'][key].metric_scope == _data['current_scope']:
-                    _data['metrics'][key].record(value)
-        for b in self.backends:
-            b.log(key, value)
-
-    def debug(self, *args, **kwargs):
-        self.log(*args, level=Level.DEBUG, **kwargs)
-
-    def info(self, *args, **kwargs):
-        self.log(*args, level=Level.INFO, **kwargs)
-
-    def warning(self, *args, **kwargs):
-        self.log(*args, level=Level.WARNING, **kwargs)
-
-    def error(self, *args, **kwargs):
-        self.log(*args, level=Level.ERROR, **kwargs)
-
-    def critical(self, *args, **kwargs):
-        self.log(*args, level=Level.CRITICAL, **kwargs)
-
-    def log_event(self, key, value=None):
-        for b in self.backends:
-            b.log_event(key, value)
-    
-    def timed_block_start(self, name):
-        if not name in _data['timed_blocks']:
-            _data['timed_blocks'][name] = OrderedDict()
-        _data['timed_blocks'][name]['start'] = time.time()
-        for b in self.backends:
-            b.timed_block_start(name)
-    
-    def timed_block_stop(self, name):
-        if not name in _data['timed_blocks']:
-            raise ValueError('timed_block_stop called before timed_block_start for ' + name)
-        _data['timed_blocks'][name]['stop'] = time.time()
-        delta = _data['timed_blocks'][name]['stop'] - _data['timed_blocks'][name]['start']
-        self.log(name + '_time', delta)
-        for b in self.backends:
-            b.timed_block_stop(name)
-
-    def iteration_start(self):
-        _data['current_scope'] = Scope.TRAIN_ITER
-        _data['iteration'] += 1
-        _data['total_iteration'] += 1
-
-
-    def iteration_stop(self):
-        for b in self.backends:
-            b.log_iteration_summary()
-        _data['current_scope'] = Scope.EPOCH
-
-    def epoch_start(self):
-        _data['current_scope'] = Scope.EPOCH 
-        _data['epoch'] += 1
-        _data['iteration'] = -1
-
-        for n, m in _data['metrics'].items():
-            if m.metric_scope == Scope.TRAIN_ITER:
-                m.reset()
-
-    def epoch_stop(self):
-        for b in self.backends:
-            b.log_epoch_summary()
-        _data['current_scope'] = Scope.RUN
-
-    def finish(self):
-        for b in self.backends:
-            b.finish()
-
-    def iteration_generator_wrapper(self, gen):
-        for g in gen:
-            self.iteration_start()
-            yield g
-            self.iteration_stop()
-
-    def epoch_generator_wrapper(self, gen):
-        for g in gen:
-            self.epoch_start()
-            yield g
-            self.epoch_stop()
-
-    @contextmanager
-    def timed_block(self, prefix, value=None, forced=False):
-        """ This function helps with timed blocks
-            ----
-            Parameters:
-            prefix - one of items from TIMED_BLOCKS; the action to be timed
-            logger - NVLogger object
-            forced - if True then the events are always logged (even if it should be skipped)
-        """
-        self.timed_block_start(prefix)
-        yield self
-        self.timed_block_stop(prefix)
-
-    def log_hardware(self):
-        autologging.log_hardware(self)
-
-    def log_args(self, args):
-        autologging.log_args(self, args)
-
-    def timed_function(self, prefix, variable=None, forced=False):
-        """ This decorator helps with timed functions
-            ----
-            Parameters:
-            prefix - one of items from TIME_BLOCK; the action to be timed
-            logger - NVLogger object
-            forced - if True then the events are always logged (even if it should be skipped)
-        """
-
-        def timed_function_decorator(func):
-            @functools.wraps(func)
-            def wrapper(*args, **kwargs):
-                value = kwargs.get(variable, next(iter(args), None))
-                with self.timed_block(prefix=prefix, value=value, forced=forced):
-                    func(*args, **kwargs)
-
-            return wrapper
-
-        return timed_function_decorator
-
-
-LOGGER = _Logger()
-

+ 0 - 255
TensorFlow/Segmentation/UNet_Industrial/dllogger/dllogger/tags.py

@@ -1,255 +0,0 @@
-# Copyright 2018 MLBenchmark Group. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Common values reported
-
-VALUE_EPOCH = "epoch"
-VALUE_ITERATION = "iteration"
-VALUE_ACCURACY = "accuracy"
-VALUE_BLEU = "bleu"
-VALUE_TOP1 = "top1"
-VALUE_TOP5 = "top5"
-VALUE_BBOX_MAP = "bbox_map"
-VALUE_MASK_MAP = "mask_map"
-VALUE_BCE = "binary_cross_entropy"
-
-
-# Timed blocks (used with timed_function & timed_block
-# For each there should be *_start and *_stop tags defined
-
-RUN_BLOCK = "run"
-SETUP_BLOCK = "setup"
-PREPROC_BLOCK = "preproc"
-
-TRAIN_BLOCK = "train"
-TRAIN_PREPROC_BLOCK = "train_preproc"
-TRAIN_EPOCH_BLOCK = "train_epoch"
-TRAIN_EPOCH_PREPROC_BLOCK = "train_epoch_preproc"
-TRAIN_CHECKPOINT_BLOCK = "train_checkpoint"
-TRAIN_ITER_BLOCK = "train_iteration"
-
-EVAL_BLOCK = "eval"
-EVAL_ITER_BLOCK = "eval_iteration"
-
-#TODO: to remove?
-TIMED_BLOCKS = {
-    RUN_BLOCK,
-    SETUP_BLOCK,
-    PREPROC_BLOCK,
-    TRAIN_BLOCK,
-    TRAIN_PREPROC_BLOCK,
-    TRAIN_EPOCH_BLOCK,
-    TRAIN_EPOCH_PREPROC_BLOCK,
-    TRAIN_CHECKPOINT_BLOCK,
-    TRAIN_ITER_BLOCK,
-    EVAL_BLOCK,
-    EVAL_ITER_BLOCK,
-}
-
-
-# Events
-
-RUN_INIT = "run_init"
-
-SETUP_START = "setup_start"
-SETUP_STOP = "setup_stop"
-
-PREPROC_START = "preproc_start"
-PREPROC_STOP = "preproc_stop"
-
-RUN_START = "run_start"
-RUN_STOP = "run_stop"
-RUN_FINAL = "run_final"
-
-TRAIN_CHECKPOINT_START = "train_checkpoint_start"
-TRAIN_CHECKPOINT_STOP = "train_checkpoint_stop"
-
-TRAIN_PREPROC_START = "train_preproc_start"
-TRAIN_PREPROC_STOP = "train_preproc_stop"
-
-TRAIN_EPOCH_PREPROC_START = "train_epoch_preproc_start"
-TRAIN_EPOCH_PREPROC_STOP = "train_epoch_preproc_stop"
-
-TRAIN_ITER_START = "train_iter_start"
-TRAIN_ITER_STOP = "train_iter_stop"
-
-TRAIN_EPOCH_START = "train_epoch_start"
-TRAIN_EPOCH_STOP = "train_epoch_stop"
-
-
-# MLPerf specific tags
-
-RUN_CLEAR_CACHES = "run_clear_caches"
-
-PREPROC_NUM_TRAIN_EXAMPLES = "preproc_num_train_examples"
-PREPROC_NUM_EVAL_EXAMPLES = "preproc_num_eval_examples"
-PREPROC_TOKENIZE_TRAINING = "preproc_tokenize_training"
-PREPROC_TOKENIZE_EVAL = "preproc_tokenize_eval"
-PREPROC_VOCAB_SIZE = "preproc_vocab_size"
-
-RUN_SET_RANDOM_SEED = "run_set_random_seed"
-
-INPUT_SIZE = "input_size"
-INPUT_BATCH_SIZE = "input_batch_size"
-INPUT_ORDER = "input_order"
-INPUT_SHARD = "input_shard"
-INPUT_BN_SPAN = "input_bn_span"
-
-INPUT_CENTRAL_CROP = "input_central_crop"
-INPUT_CROP_USES_BBOXES = "input_crop_uses_bboxes"
-INPUT_DISTORTED_CROP_MIN_OBJ_COV = "input_distorted_crop_min_object_covered"
-INPUT_DISTORTED_CROP_RATIO_RANGE = "input_distorted_crop_aspect_ratio_range"
-INPUT_DISTORTED_CROP_AREA_RANGE = "input_distorted_crop_area_range"
-INPUT_DISTORTED_CROP_MAX_ATTEMPTS = "input_distorted_crop_max_attempts"
-INPUT_MEAN_SUBTRACTION = "input_mean_subtraction"
-INPUT_RANDOM_FLIP = "input_random_flip"
-
-INPUT_RESIZE = "input_resize"
-INPUT_RESIZE_ASPECT_PRESERVING = "input_resize_aspect_preserving"
-
-
-# Opt
-
-OPT_NAME = "opt_name"
-
-OPT_LR = "opt_learning_rate"
-OPT_MOMENTUM = "opt_momentum"
-
-OPT_WEIGHT_DECAY = "opt_weight_decay"
-
-OPT_HP_ADAM_BETA1 = "opt_hp_Adam_beta1"
-OPT_HP_ADAM_BETA2 = "opt_hp_Adam_beta2"
-OPT_HP_ADAM_EPSILON = "opt_hp_Adam_epsilon"
-
-OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps"
-
-
-#  Train
-
-TRAIN_LOOP = "train_loop"
-TRAIN_EPOCH = "train_epoch"
-TRAIN_CHECKPOINT = "train_checkpoint"
-TRAIN_LOSS = "train_loss"
-TRAIN_ITERATION_LOSS = "train_iteration_loss"
-
-
-# Eval
-
-EVAL_START = "eval_start"
-EVAL_SIZE = "eval_size"
-EVAL_TARGET = "eval_target"
-EVAL_ACCURACY = "eval_accuracy"
-EVAL_STOP = "eval_stop"
-
-
-# Perf
-
-PERF_IT_PER_SEC = "perf_it_per_sec"
-PERF_TIME_TO_TRAIN = "time_to_train"
-
-EVAL_ITERATION_ACCURACY = "eval_iteration_accuracy"
-
-
-# Model
-
-MODEL_HP_LOSS_FN = "model_hp_loss_fn"
-
-MODEL_HP_INITIAL_SHAPE = "model_hp_initial_shape"
-MODEL_HP_FINAL_SHAPE = "model_hp_final_shape"
-
-MODEL_L2_REGULARIZATION = "model_l2_regularization"
-MODEL_EXCLUDE_BN_FROM_L2 = "model_exclude_bn_from_l2"
-
-MODEL_HP_RELU = "model_hp_relu"
-MODEL_HP_CONV2D_FIXED_PADDING = "model_hp_conv2d_fixed_padding"
-MODEL_HP_BATCH_NORM = "model_hp_batch_norm"
-MODEL_HP_DENSE = "model_hp_dense"
-
-
-# GNMT specific
-
-MODEL_HP_LOSS_SMOOTHING = "model_hp_loss_smoothing"
-MODEL_HP_NUM_LAYERS = "model_hp_num_layers"
-MODEL_HP_HIDDEN_SIZE = "model_hp_hidden_size"
-MODEL_HP_DROPOUT = "model_hp_dropout"
-
-EVAL_HP_BEAM_SIZE = "eval_hp_beam_size"
-TRAIN_HP_MAX_SEQ_LEN = "train_hp_max_sequence_length"
-EVAL_HP_MAX_SEQ_LEN = "eval_hp_max_sequence_length"
-EVAL_HP_LEN_NORM_CONST = "eval_hp_length_normalization_constant"
-EVAL_HP_LEN_NORM_FACTOR = "eval_hp_length_normalization_factor"
-EVAL_HP_COV_PENALTY_FACTOR = "eval_hp_coverage_penalty_factor"
-
-
-# NCF specific
-
-PREPROC_HP_MIN_RATINGS = "preproc_hp_min_ratings"
-PREPROC_HP_NUM_EVAL = "preproc_hp_num_eval"
-PREPROC_HP_SAMPLE_EVAL_REPLACEMENT = "preproc_hp_sample_eval_replacement"
-
-INPUT_HP_NUM_NEG = "input_hp_num_neg"
-INPUT_HP_SAMPLE_TRAIN_REPLACEMENT = "input_hp_sample_train_replacement"
-INPUT_STEP_TRAIN_NEG_GEN = "input_step_train_neg_gen"
-INPUT_STEP_EVAL_NEG_GEN = "input_step_eval_neg_gen"
-
-EVAL_HP_NUM_USERS = "eval_hp_num_users"
-EVAL_HP_NUM_NEG = "eval_hp_num_neg"
-
-MODEL_HP_MF_DIM = "model_hp_mf_dim"
-MODEL_HP_MLP_LAYER_SIZES = "model_hp_mlp_layer_sizes"
-
-
-# RESNET specific
-
-EVAL_EPOCH_OFFSET = "eval_offset"
-
-MODEL_HP_INITIAL_MAX_POOL = "model_hp_initial_max_pool"
-MODEL_HP_BEGIN_BLOCK = "model_hp_begin_block"
-MODEL_HP_END_BLOCK = "model_hp_end_block"
-MODEL_HP_BLOCK_TYPE = "model_hp_block_type"
-MODEL_HP_PROJECTION_SHORTCUT = "model_hp_projection_shortcut"
-MODEL_HP_SHORTCUT_ADD = "model_hp_shorcut_add"
-MODEL_HP_RESNET_TOPOLOGY = "model_hp_resnet_topology"
-
-
-# Transformer specific
-
-INPUT_MAX_LENGTH = "input_max_length"
-
-MODEL_HP_INITIALIZER_GAIN = "model_hp_initializer_gain"
-MODEL_HP_VOCAB_SIZE = "model_hp_vocab_size"
-MODEL_HP_NUM_HIDDEN_LAYERS = "model_hp_hidden_layers"
-MODEL_HP_EMBEDDING_SHARED_WEIGHTS = "model_hp_embedding_shared_weights"
-MODEL_HP_ATTENTION_DENSE = "model_hp_attention_dense"
-MODEL_HP_ATTENTION_DROPOUT = "model_hp_attention_dropout"
-MODEL_HP_FFN_OUTPUT_DENSE = "model_hp_ffn_output_dense"
-MODEL_HP_FFN_FILTER_DENSE = "model_hp_ffn_filter_dense"
-MODEL_HP_RELU_DROPOUT = "model_hp_relu_dropout"
-MODEL_HP_LAYER_POSTPROCESS_DROPOUT = "model_hp_layer_postprocess_dropout"
-MODEL_HP_NORM = "model_hp_norm"
-MODEL_HP_SEQ_BEAM_SEARCH = "model_hp_sequence_beam_search"
-

+ 0 - 151
TensorFlow/Segmentation/UNet_Industrial/dllogger/dummy_run.py

@@ -1,151 +0,0 @@
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from dllogger import LOGGER, CompactBackend, StdOutBackend, MLPerfBackend, JsonBackend, Scope, AverageMeter, tags
-from argparse import ArgumentParser
-import random
-
[email protected]_function("train")
-def train():
-    for i in range(0, 10):
-    #for i in LOGGER.epoch_generator_wrapper(range(0, 10)):
-        LOGGER.epoch_start()
-        LOGGER.log("epoch_nr", i)
-        LOGGER.log("epochs2", 2 * i)
-        train_epoch(i)
-        LOGGER.epoch_stop()
-
-
[email protected]_function("train_epoch", "epoch")
-def train_epoch(epoch):
-    for i in range(epoch*30, (epoch+1)*30, 2):
-    #for i in LOGGER.iteration_generator_wrapper(range(epoch*10, (epoch+1)*10, 2)):
-        LOGGER.iteration_start()
-        LOGGER.log("loss", i*epoch)
-        LOGGER.iteration_stop()
-    if epoch % 3 == 1:
-        with LOGGER.timed_block("eval"):
-            LOGGER.log("accuracy", i * epoch)
-            LOGGER.log_event(key="ep divisible by 3", value=epoch)
-
-
-def main():
-    LOGGER.set_model_name('ResNet')
-    LOGGER.set_backends([
-            StdOutBackend(log_file='std.out',
-                logging_scope=Scope.TRAIN_ITER),
-            CompactBackend(log_file=None,
-                logging_scope=Scope.TRAIN_ITER, iteration_interval=5),
-            JsonBackend(log_file='dummy.json',
-                logging_scope=Scope.TRAIN_ITER, iteration_interval=4)
-            ])
-
-    parser = ArgumentParser()
-    parser.add_argument('--dummy', type=str, default='default_dummy_value')
-    args = parser.parse_args()
-
-    LOGGER.log_hardware()
-    LOGGER.log_args(args)
-
-    LOGGER.log(tags.RUN_INIT)
-    LOGGER.register_metric('loss', meter=AverageMeter(), metric_scope=Scope.TRAIN_ITER)
-    LOGGER.register_metric('epoch_nr', metric_scope=Scope.EPOCH)
-    LOGGER.register_metric('epochs2')
-
-    with LOGGER.timed_block(tags.SETUP_BLOCK):
-        print("This is setup.")
-
-    with LOGGER.timed_block(tags.PREPROC_BLOCK):
-        print("This is preprocessing.")
-
-    with LOGGER.timed_block(tags.RUN_BLOCK):
-        print("This is run.")
-        train()
-        print("This is the end.")
-
-    LOGGER.log(tags.RUN_FINAL)
-
-    LOGGER.finish()
-
-def main2():
-    LOGGER.set_backends([
-            CompactBackend(log_file=None,
-                logging_scope=Scope.TRAIN_ITER),
-            StdOutBackend(log_file='std.out',
-                logging_scope=Scope.EPOCH, iteration_interval=4),
-            JsonBackend(log_file='dummy.json',
-                logging_scope=Scope.TRAIN_ITER, iteration_interval=1)
-            ])
-    LOGGER.log_hardware()
-
-    data_x = range(0,10)
-    data_y = [3.*x + 2. for x in data_x]
-
-    data = list(zip(data_x, data_y))
-
-    LOGGER.register_metric('l', AverageMeter(), metric_scope=Scope.TRAIN_ITER)
-    LOGGER.register_metric('a', metric_scope=Scope.TRAIN_ITER)
-    LOGGER.register_metric('b', metric_scope=Scope.TRAIN_ITER)
-
-    LOGGER.info('RUN_INIT')
-
-    model_a = 1.
-    model_b = 0.
-
-    def model(ma, mb, x):
-        return ma*x+mb
-
-    def loss(y, t):
-        return (y-t)**2
-
-    def update_a(ma, mb, x, t):
-        return ma - 0.001 * 2*x*(ma*x+mb-t)
-
-    def update_b(ma, mb, x, t):
-        return mb - 0.001 * 2*(ma*x+mb-t)
-
-    for e in range(0, 5):
-        LOGGER.epoch_start()
-        for (x, t) in data:
-            LOGGER.iteration_start()
-            y = model(model_a, model_b, x)
-            model_a = update_a(model_a, model_b, x, t)
-            model_b = update_b(model_a, model_b, x, t)
-            l = loss(y, t)
-            LOGGER.info('b', model_b)
-            LOGGER.debug('a', model_a)
-            LOGGER.warning('l', l)
-            #LOGGER.log('a', model_a)
-            LOGGER.iteration_stop()
-        LOGGER.epoch_stop()
-
-    #for e in LOGGER.epoch_generator_wrapper(range(0, 10)):
-    #    for (x, t) in LOGGER.iteration_generator_wrapper(random.sample(data, len(data))):
-    #        y = model(model_a, model_b, x)
-    #        model_a = update_a(model_a, model_b, x, t)
-    #        model_b = update_b(model_a, model_b, x, t)
-    #        l = loss(y, t)
-    #        LOGGER.debug('a', model_a)
-    #        LOGGER.info('b', model_b)
-    #        LOGGER.warning('l', l)
-
-
-    LOGGER.finish()
-
-    print("FINAL: {}*x+{}".format(model_a, model_b))
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 37
TensorFlow/Segmentation/UNet_Industrial/dllogger/setup.py

@@ -1,37 +0,0 @@
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import setuptools
-
-with open("README.md", "r") as f:
-  long_description = f.read()
-
-setuptools.setup(
-    name="DLLogger",
-    version="0.3.1",
-    author="Lukasz Mazurek",
-    author_email="[email protected]",
-    description="Tools for logging DL training.",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/nvlmazurek/DLLogger",
-    packages=['dllogger'],
-    classifiers=[
-      "Programming Language :: Python :: 2",
-      "Programming Language :: Python :: 3",
-      "License :: BSD",
-      "Operating System :: OS Independent",
-    ],
-    license="BSD",
-)

+ 12 - 17
TensorFlow/Segmentation/UNet_Industrial/main.py

@@ -22,6 +22,7 @@
 import os
 
 import warnings
+
 warnings.simplefilter("ignore")
 
 import tensorflow as tf
@@ -32,12 +33,13 @@ from utils import hvd_utils
 from runtime import Runner
 
 from utils.cmdline_helper import parse_cmdline
+from utils.logging import init_dllogger
 
 if __name__ == "__main__":
 
     tf.logging.set_verbosity(tf.logging.ERROR)
-
     FLAGS = parse_cmdline()
+    init_dllogger(FLAGS.log_dir)
 
     RUNNING_CONFIG = tf.contrib.training.HParams(
         exec_mode=FLAGS.exec_mode,
@@ -130,7 +132,6 @@ if __name__ == "__main__":
     )
 
     if RUNNING_CONFIG.exec_mode in ["train", "train_and_evaluate", "training_benchmark"]:
-
         runner.train(
             iter_unit=RUNNING_CONFIG.iter_unit,
             num_iter=RUNNING_CONFIG.num_iter,
@@ -147,18 +148,12 @@ if __name__ == "__main__":
             is_benchmark=RUNNING_CONFIG.exec_mode == 'training_benchmark'
         )
 
-    if RUNNING_CONFIG.exec_mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
-
-        if RUNNING_CONFIG.exec_mode == 'inference_benchmark' and hvd_utils.is_using_hvd():
-            raise NotImplementedError("Only single GPU inference is implemented.")
-
-        elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
-
-            runner.evaluate(
-                iter_unit=RUNNING_CONFIG.iter_unit if RUNNING_CONFIG.exec_mode != "train_and_evaluate" else "epoch",
-                num_iter=RUNNING_CONFIG.num_iter if RUNNING_CONFIG.exec_mode != "train_and_evaluate" else 1,
-                warmup_steps=RUNNING_CONFIG.warmup_steps,
-                batch_size=RUNNING_CONFIG.batch_size,
-                is_benchmark=RUNNING_CONFIG.exec_mode == 'inference_benchmark',
-                save_eval_results_to_json=RUNNING_CONFIG.save_eval_results_to_json
-            )
+    if RUNNING_CONFIG.exec_mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark'] and (not hvd_utils.is_using_hvd() or hvd.rank() == 0):
+        runner.evaluate(
+            iter_unit=RUNNING_CONFIG.iter_unit if RUNNING_CONFIG.exec_mode != "train_and_evaluate" else "epoch",
+            num_iter=RUNNING_CONFIG.num_iter if RUNNING_CONFIG.exec_mode != "train_and_evaluate" else 1,
+            warmup_steps=RUNNING_CONFIG.warmup_steps,
+            batch_size=RUNNING_CONFIG.batch_size,
+            is_benchmark=RUNNING_CONFIG.exec_mode == 'inference_benchmark',
+            save_eval_results_to_json=RUNNING_CONFIG.save_eval_results_to_json
+        )

+ 2 - 4
TensorFlow/Segmentation/UNet_Industrial/model/layers/utils.py

@@ -19,8 +19,6 @@
 #
 # ==============================================================================
 
-from dllogger.logger import LOGGER
-
 import horovod.tensorflow as hvd
 
 from utils import hvd_utils
@@ -37,5 +35,5 @@ def _log_hparams(classname, layername, **kwargs):
 
     log_msg += "\n"
 
-    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-        LOGGER.log(log_msg)
+    if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+        print(log_msg)

+ 9 - 3
TensorFlow/Segmentation/UNet_Industrial/model/unet.py

@@ -32,7 +32,7 @@ from utils import metrics
 
 from utils import image_processing
 
-from dllogger.logger import LOGGER
+from dllogger import Logger
 
 __all__ = ["UNet_v1"]
 
@@ -215,6 +215,12 @@ class UNet_v1(object):
             labels = tf.cast(labels, tf.float32)
             labels_preds = tf.reduce_max(y_pred, axis=(1, 2, 3))
 
+            # NOTE(review): the former `assert (tensor, "msg")` was a no-op --
+            # asserting a non-empty tuple is always True, and Python `assert`
+            # cannot evaluate a tf.Tensor at graph-build time anyway. Clip the
+            # predictions into the valid [0, 1] probability range instead.
+            labels_preds = tf.clip_by_value(labels_preds, 0, 1)
+
             with tf.variable_scope("Confusion_Matrix") as scope:
 
                 tp, update_tp = tf.metrics.true_positives_at_thresholds(
@@ -380,8 +386,8 @@ class UNet_v1(object):
 
                     if params["apply_manual_loss_scaling"]:
 
-                        if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-                            LOGGER.log("Applying manual Loss Scaling ...")
+                        # if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+                        #     Logger.log("Applying manual Loss Scaling ...")
 
                         loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(
                             init_loss_scale=2**32,  # 4,294,967,296

+ 1 - 0
TensorFlow/Segmentation/UNet_Industrial/requirements.txt

@@ -0,0 +1 @@
+git+https://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc

+ 94 - 80
TensorFlow/Segmentation/UNet_Industrial/runtime/runner.py

@@ -19,8 +19,6 @@
 #
 # ==============================================================================
 
-from __future__ import print_function
-
 import os
 import json
 import multiprocessing
@@ -40,8 +38,7 @@ from utils import hvd_utils
 
 from utils.hooks import ProfilerHook
 
-from dllogger.logger import LOGGER
-import dllogger.logger as dllg
+import dllogger as Logger
 
 __all__ = [
     'Runner',
@@ -101,26 +98,11 @@ class Runner(object):
         if data_dir is not None and not os.path.exists(data_dir):
             raise ValueError("The `data_dir` received does not exists: %s" % data_dir)
 
-        LOGGER.set_model_name('UNet_TF')
-
-        LOGGER.set_backends(
-            [
-                dllg.JsonBackend(
-                    log_file=os.path.join(model_dir, 'dlloger_out.json'),
-                    logging_scope=dllg.Scope.TRAIN_ITER,
-                    iteration_interval=log_every_n_steps
-                ),
-                dllg.StdOutBackend(
-                    log_file=None, logging_scope=dllg.Scope.TRAIN_ITER, iteration_interval=log_every_n_steps
-                )
-            ]
-        )
-
         if hvd_utils.is_using_hvd():
             hvd.init()
 
-            if hvd.local_rank() == 0:
-                LOGGER.log("Horovod successfully initialized ...")
+            if hvd.rank() == 0:
+                print("Horovod successfully initialized ...")
 
             tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None
 
@@ -135,10 +117,9 @@ class Runner(object):
 
         os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
 
-        # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-
         os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
         os.environ['TF_GPU_THREAD_COUNT'] = '1' if not hvd_utils.is_using_hvd() else str(hvd.size())
+        print("WORLD_SIZE", hvd.size())
 
         os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
 
@@ -148,7 +129,6 @@ class Runner(object):
 
         os.environ['TF_SYNC_ON_FINISH'] = '0'
         os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
-        # os.environ['TF_DISABLE_NVTX_RANGES'] = '1' 
 
         # =================================================
 
@@ -156,8 +136,8 @@ class Runner(object):
 
         if use_tf_amp:
 
-            if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-                LOGGER.log("TF AMP is activated - Experimental Feature")
+            if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+                print("TF AMP is activated - Experimental Feature")
 
             os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
 
@@ -205,8 +185,8 @@ class Runner(object):
 
         self.run_hparams = Runner._build_hparams(model_hparams, run_config_additional)
 
-        if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-            LOGGER.log('Defining Model Estimator ...\n')
+        if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+            print('Defining Model Estimator ...\n')
 
         self._model = UNet_v1(
             model_name="UNet_v1",
@@ -220,8 +200,8 @@ class Runner(object):
 
         if self.run_hparams.seed is not None:
 
-            if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-                LOGGER.log("Deterministic Run - Seed: %d\n" % seed)
+            if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+                print("Deterministic Run - Seed: %d\n" % seed)
 
             tf.set_random_seed(self.run_hparams.seed)
             np.random.seed(self.run_hparams.seed)
@@ -250,7 +230,7 @@ class Runner(object):
                     hparams.add_hparam(name=key, value=val)
 
                 except ValueError:
-                    LOGGER.log(
+                    print(
                         "the parameter `{}` already exists - existing value: {} and duplicated value: {}".format(
                             key, hparams.get(key), val
                         )
@@ -278,13 +258,13 @@ class Runner(object):
         config.log_device_placement = False
 
         config.gpu_options.allow_growth = True
-        # config.gpu_options.per_process_gpu_memory_fraction=0.7
 
         if hvd_utils.is_using_hvd():
-            config.gpu_options.visible_device_list = str(hvd.local_rank())
+            config.gpu_options.visible_device_list = str(hvd.rank())
 
-        if use_xla:  # Only working on single GPU
-            LOGGER.log("XLA is activated - Experimental Feature")
+        if use_xla:
+            if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+                print("XLA is activated - Experimental Feature")
             config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
 
         config.gpu_options.force_gpu_compatible = True  # Force pinned memory
@@ -382,8 +362,8 @@ class Runner(object):
         if self.run_hparams.use_tf_amp:
             if use_auto_loss_scaling:
 
-                if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-                    LOGGER.log("TF Loss Auto Scaling is activated - Experimental Feature")
+                if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+                    print("TF Loss Auto Scaling is activated - Experimental Feature")
 
                 os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
                 apply_manual_loss_scaling = False
@@ -394,9 +374,6 @@ class Runner(object):
         else:
             apply_manual_loss_scaling = False
 
-        if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-            LOGGER.log('Defining Model Estimator ...\n')
-
         global_batch_size = batch_size * self.num_gpus
 
         if self.run_hparams.data_dir is not None:
@@ -416,7 +393,7 @@ class Runner(object):
         if hvd_utils.is_using_hvd():
             training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
 
-        if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
+        if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
             training_hooks.append(
                 ProfilerHook(
                     global_batch_size=global_batch_size,
@@ -427,26 +404,28 @@ class Runner(object):
                 )
             )
 
-            LOGGER.log('Starting Model Training ...\n')
+            print("Starting Model Training ...")
 
-            LOGGER.log("=> Epochs: %d" % num_epochs)
-            LOGGER.log("=> Total Steps: %d" % num_steps)
-            LOGGER.log("=> Steps per Epoch: %d" % steps_per_epoch)
-            LOGGER.log("=> Weight Decay Factor: %.1e" % weight_decay)
-            LOGGER.log("=> Learning Rate: %.1e" % learning_rate)
-            LOGGER.log("=> Learning Rate Decay Factor: %.2f" % learning_rate_decay_factor)
-            LOGGER.log("=> Learning Rate Decay Steps: %d" % learning_rate_decay_steps)
-            LOGGER.log("=> RMSProp - Decay: %.1f" % rmsprop_decay)
-            LOGGER.log("=> RMSProp - Momentum: %.1f" % rmsprop_momentum)
-            LOGGER.log("=> Loss Function Name: %s" % self.run_hparams.loss_fn_name)
+            Logger.log(step=(), data={"Epochs": num_epochs}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Total Steps": num_steps}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Steps per Epoch": steps_per_epoch}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Weight Decay Factor": weight_decay}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Learning Rate": learning_rate}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Learning Rate Decay Factor": learning_rate_decay_factor}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Learning Rate Decay Steps": learning_rate_decay_steps}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"RMSProp - Decay": rmsprop_decay}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"RMSProp - Momentum": rmsprop_momentum}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Loss Function Name": self.run_hparams.loss_fn_name}, verbosity=Logger.Verbosity.DEFAULT)
 
             if self.run_hparams.use_tf_amp:
-                LOGGER.log("=> Use Auto Loss Scaling: %s" % use_auto_loss_scaling)
+                Logger.log(step=(), data={"Use Auto Loss Scaling": use_auto_loss_scaling}, verbosity=Logger.Verbosity.DEFAULT)
+
+            Logger.log(step=(), data={"# GPUs": self.num_gpus}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"GPU Batch Size": batch_size}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Global Batch Size": global_batch_size}, verbosity=Logger.Verbosity.DEFAULT)
+            Logger.log(step=(), data={"Total Files to be Processed": num_steps * global_batch_size}, verbosity=Logger.Verbosity.DEFAULT)
 
-            LOGGER.log("=> # GPUs: %d" % self.num_gpus)
-            LOGGER.log("=> GPU Batch Size: %d" % batch_size)
-            LOGGER.log("=> Global Batch Size: %d" % global_batch_size)
-            LOGGER.log("=> Total Files to Processed: %d\n" % (num_steps * global_batch_size))
+            print()  # visual spacing
 
         estimator_params = {
             'batch_size': batch_size,
@@ -480,8 +459,8 @@ class Runner(object):
                 )
 
             else:
-                if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-                    LOGGER.log("Using Synthetic Data ...")
+                if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+                    print("Using Synthetic Data ...")
 
                 return self.dataset.synth_dataset_fn(
                     batch_size=batch_size,
@@ -507,8 +486,8 @@ class Runner(object):
         except KeyboardInterrupt:
             print("Keyboard interrupt")
 
-        if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
-            LOGGER.log('Ending Model Training ...')
+        if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+            print('Ending Model Training ...')
 
     def evaluate(self, iter_unit, num_iter, batch_size, warmup_steps=50, is_benchmark=False, save_eval_results_to_json=False):
 
@@ -518,10 +497,10 @@ class Runner(object):
         if self.run_hparams.data_dir is None and not is_benchmark:
             raise ValueError('`data_dir` must be specified for evaluation!')
 
-        if hvd_utils.is_using_hvd() and hvd.rank() != 0:
-            raise RuntimeError('Multi-GPU inference is not supported')
+        # if hvd_utils.is_using_hvd() and hvd.rank() != 0:
+        #     raise RuntimeError('Multi-GPU inference is not supported')
 
-        LOGGER.log('Defining Model Estimator ...\n')
+        print('Defining Model Estimator ...\n')
 
         if self.run_hparams.data_dir is not None:
             filenames, num_samples, num_steps, num_epochs = self.dataset.get_dataset_runtime_specs(
@@ -545,13 +524,15 @@ class Runner(object):
             )
         ]
 
-        LOGGER.log('Starting Model Evaluation ...\n')
+        print('Starting Model Evaluation ...\n')
+
+        Logger.log(step=(), data={"Epochs": num_epochs}, verbosity=Logger.Verbosity.DEFAULT)
+        Logger.log(step=(), data={"Total Steps": num_steps}, verbosity=Logger.Verbosity.DEFAULT)
+        Logger.log(step=(), data={"Steps per Epoch": steps_per_epoch}, verbosity=Logger.Verbosity.DEFAULT)
+        Logger.log(step=(), data={"GPU Batch Size": batch_size}, verbosity=Logger.Verbosity.DEFAULT)
+        Logger.log(step=(), data={"Total Files to be Processed": num_steps * batch_size}, verbosity=Logger.Verbosity.DEFAULT)
 
-        LOGGER.log("=> Epochs: %d" % num_epochs)
-        LOGGER.log("=> Total Steps: %d" % num_steps)
-        LOGGER.log("=> Steps per Epoch: %d" % steps_per_epoch)
-        LOGGER.log("=> GPU Batch Size: %d" % batch_size)
-        LOGGER.log("=> Total Files to Processed: %d\n" % (num_steps * batch_size))
+        print()  # visual spacing
 
         estimator_params = {
             'batch_size': batch_size,
@@ -578,7 +559,7 @@ class Runner(object):
                 )
 
             else:
-                LOGGER.log("Using Synthetic Data ...")
+                print("Using Synthetic Data ...")
 
                 return self.dataset.synth_dataset_fn(
                     batch_size=batch_size,
@@ -602,16 +583,20 @@ class Runner(object):
                 hooks=evaluation_hooks,
             )
 
-            LOGGER.log('Ending Model Evaluation ...')
+            print('Ending Model Evaluation ...')
 
-            LOGGER.log('###################################\n\nEvaluation Results:\n')
+            print('###################################\n\nEvaluation Results:\n')
 
             for key, val in sorted(eval_results.items(), key=operator.itemgetter(0)):
 
                 if any(val in key for val in ["loss", "global_step", "Confusion_Matrix"]):
                     continue
 
-                LOGGER.log('%s: %.3f' % (key, float(val)))
+                Logger.log(
+                    step=(),
+                    data={"{prefix}.{key}".format(prefix=Logger._stage, key=key): float(val)},
+                    verbosity=Logger.Verbosity.DEFAULT
+                )
 
             fns = eval_results["Confusion_Matrix_FN"]
             fps = eval_results["Confusion_Matrix_FP"]
@@ -624,12 +609,41 @@ class Runner(object):
             tpr = np.divide(tps, positives)
             tnr = np.divide(tns, negatives)
 
-            LOGGER.log('TP', tps)
-            LOGGER.log('FN', fns)
-            LOGGER.log('TN', tns)
-            LOGGER.log('FP', fps)
-            LOGGER.log('TPR', tpr)
-            LOGGER.log('TNR', tnr)
+            Logger.log(
+                step=(num_steps,),
+                data={"{prefix}.true_positives".format(prefix=Logger._stage): str(tps)},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
+
+            Logger.log(
+                step=(num_steps,),
+                data={"{prefix}.true_negatives".format(prefix=Logger._stage): str(tns)},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
+
+            Logger.log(
+                step=(num_steps,),
+                data={"{prefix}.false_positives".format(prefix=Logger._stage): str(fps)},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
+
+            Logger.log(
+                step=(num_steps,),
+                data={"{prefix}.false_negatives".format(prefix=Logger._stage): str(fns)},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
+
+            Logger.log(
+                step=(num_steps,),
+                data={"{prefix}.true_positive_rate".format(prefix=Logger._stage): str(["%.3f" % x for x in tpr])},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
+
+            Logger.log(
+                step=(num_steps,),
+                data={"{prefix}.true_negative_rate".format(prefix=Logger._stage): str(["%.3f" % x for x in tnr])},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
 
             if save_eval_results_to_json:
 

+ 3 - 3
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_1GPU.sh

@@ -15,13 +15,13 @@
 # limitations under the License.
 
 # This script launches UNet training in FP32-AMP on 1 GPU using 16 batch size (16 per GPU)
-# Usage ./UNet_FP32AMP_1GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./UNet_AMP_1GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python ${BASEDIR}/../main.py \
+python "${BASEDIR}/../main.py" \
     --unet_variant='tinyUNet' \
     --activation_fn='relu' \
     --exec_mode='train_and_evaluate' \

+ 50 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_1GPU_XLA.sh

@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches UNet training in FP32-AMP on 1 GPU using 16 batch size (16 per GPU)
+# Usage ./UNet_AMP_1GPU_XLA.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+python "${BASEDIR}/../main.py" \
+    --unet_variant='tinyUNet' \
+    --activation_fn='relu' \
+    --exec_mode='train_and_evaluate' \
+    --iter_unit='batch' \
+    --num_iter=2500 \
+    --batch_size=16 \
+    --warmup_step=10 \
+    --results_dir="${1}" \
+    --data_dir="${2}" \
+    --dataset_name='DAGM2007' \
+    --dataset_classID="${3}" \
+    --data_format='NCHW' \
+    --use_auto_loss_scaling \
+    --use_tf_amp \
+    --use_xla \
+    --learning_rate=1e-4 \
+    --learning_rate_decay_factor=0.8 \
+    --learning_rate_decay_steps=500 \
+    --rmsprop_decay=0.9 \
+    --rmsprop_momentum=0.8 \
+    --loss_fn_name='adaptive_loss' \
+    --weight_decay=1e-5 \
+    --weight_init_method='he_uniform' \
+    --augment_data \
+    --display_every=250 \
+    --debug_verbosity=0

+ 3 - 3
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_4GPU.sh

@@ -15,11 +15,11 @@
 # limitations under the License.
 
 # This script launches UNet training in FP32-AMP on 4 GPUs using 16 batch size (4 per GPU)
-# Usage ./UNet_FP32AMP_4GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./UNet_AMP_4GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
 mpirun \
     -np 4 \
@@ -31,7 +31,7 @@ mpirun \
     -x PATH \
     -mca pml ob1 -mca btl ^openib \
     --allow-run-as-root \
-    python ${BASEDIR}/../main.py \
+    python "${BASEDIR}/../main.py" \
         --unet_variant='tinyUNet' \
         --activation_fn='relu' \
         --exec_mode='train_and_evaluate' \

+ 60 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_4GPU_XLA.sh

@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches UNet training in FP32-AMP on 4 GPUs using 16 batch size (4 per GPU)
+# Usage ./UNet_AMP_4GPU_XLA.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+mpirun \
+    -np 4 \
+    -H localhost:4 \
+    -bind-to none \
+    -map-by slot \
+    -x NCCL_DEBUG=VERSION \
+    -x LD_LIBRARY_PATH \
+    -x PATH \
+    -mca pml ob1 -mca btl ^openib \
+    --allow-run-as-root \
+    python "${BASEDIR}/../main.py" \
+        --unet_variant='tinyUNet' \
+        --activation_fn='relu' \
+        --exec_mode='train_and_evaluate' \
+        --iter_unit='batch' \
+        --num_iter=2500 \
+        --batch_size=4 \
+        --warmup_step=10 \
+        --results_dir="${1}" \
+        --data_dir="${2}" \
+        --dataset_name='DAGM2007' \
+        --dataset_classID="${3}" \
+        --data_format='NCHW' \
+        --use_auto_loss_scaling \
+        --use_tf_amp \
+        --use_xla \
+        --learning_rate=1e-4 \
+        --learning_rate_decay_factor=0.8 \
+        --learning_rate_decay_steps=500 \
+        --rmsprop_decay=0.9 \
+        --rmsprop_momentum=0.8 \
+        --loss_fn_name='adaptive_loss' \
+        --weight_decay=1e-5 \
+        --weight_init_method='he_uniform' \
+        --augment_data \
+        --display_every=250 \
+        --debug_verbosity=0

+ 3 - 3
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_8GPU.sh

@@ -15,11 +15,11 @@
 # limitations under the License.
 
 # This script launches UNet training in FP32-AMP on 8 GPUs using 16 batch size (2 per GPU)
-# Usage ./UNet_FP32AMP_8GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./UNet_AMP_8GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
 mpirun \
     -np 8 \
@@ -31,7 +31,7 @@ mpirun \
     -x PATH \
     -mca pml ob1 -mca btl ^openib \
     --allow-run-as-root \
-    python ${BASEDIR}/../main.py \
+    python "${BASEDIR}/../main.py" \
         --unet_variant='tinyUNet' \
         --activation_fn='relu' \
         --exec_mode='train_and_evaluate' \

+ 60 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_8GPU_XLA.sh

@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches UNet training in FP32-AMP on 8 GPUs using 16 batch size (2 per GPU)
+# Usage ./UNet_AMP_8GPU_XLA.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+mpirun \
+    -np 8 \
+    -H localhost:8 \
+    -bind-to none \
+    -map-by slot \
+    -x NCCL_DEBUG=VERSION \
+    -x LD_LIBRARY_PATH \
+    -x PATH \
+    -mca pml ob1 -mca btl ^openib \
+    --allow-run-as-root \
+    python "${BASEDIR}/../main.py" \
+        --unet_variant='tinyUNet' \
+        --activation_fn='relu' \
+        --exec_mode='train_and_evaluate' \
+        --iter_unit='batch' \
+        --num_iter=2500 \
+        --batch_size=2 \
+        --warmup_step=10 \
+        --results_dir="${1}" \
+        --data_dir="${2}" \
+        --dataset_name='DAGM2007' \
+        --dataset_classID="${3}" \
+        --data_format='NCHW' \
+        --use_auto_loss_scaling \
+        --use_tf_amp \
+        --use_xla \
+        --learning_rate=1e-4 \
+        --learning_rate_decay_factor=0.8 \
+        --learning_rate_decay_steps=500 \
+        --rmsprop_decay=0.9 \
+        --rmsprop_momentum=0.8 \
+        --loss_fn_name='adaptive_loss' \
+        --weight_decay=1e-5 \
+        --weight_init_method='he_uniform' \
+        --augment_data \
+        --display_every=250 \
+        --debug_verbosity=0

+ 3 - 3
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_EVAL.sh

@@ -15,13 +15,13 @@
 # limitations under the License.
 
 # This script launches UNet evaluation in FP32-AMP on 1 GPUs using 16 batch size
-# Usage ./UNet_FP32AMP_EVAL.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./UNet_AMP_EVAL.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python ${BASEDIR}/../main.py \
+python "${BASEDIR}/../main.py" \
     --unet_variant='tinyUNet' \
     --activation_fn='relu' \
     --exec_mode='evaluate' \

+ 50 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_AMP_EVAL_XLA.sh

@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches UNet evaluation in FP32-AMP on 1 GPU using 16 batch size
+# Usage ./UNet_AMP_EVAL_XLA.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+python "${BASEDIR}/../main.py" \
+    --unet_variant='tinyUNet' \
+    --activation_fn='relu' \
+    --exec_mode='evaluate' \
+    --iter_unit='epoch' \
+    --num_iter=1 \
+    --batch_size=16 \
+    --warmup_step=10 \
+    --results_dir="${1}" \
+    --data_dir="${2}" \
+    --dataset_name='DAGM2007' \
+    --dataset_classID="${3}" \
+    --data_format='NCHW' \
+    --use_auto_loss_scaling \
+    --use_tf_amp \
+    --use_xla \
+    --learning_rate=1e-4 \
+    --learning_rate_decay_factor=0.8 \
+    --learning_rate_decay_steps=500 \
+    --rmsprop_decay=0.9 \
+    --rmsprop_momentum=0.8 \
+    --loss_fn_name='adaptive_loss' \
+    --weight_decay=1e-5 \
+    --weight_init_method='he_uniform' \
+    --augment_data \
+    --display_every=50 \
+    --debug_verbosity=0

+ 2 - 2
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_1GPU.sh

@@ -19,9 +19,9 @@
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python ${BASEDIR}/../main.py \
+python "${BASEDIR}/../main.py" \
     --unet_variant='tinyUNet' \
     --activation_fn='relu' \
     --exec_mode='train_and_evaluate' \

+ 50 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_1GPU_XLA.sh

@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches UNet training in FP32 on 1 GPU using 16 batch size (16 per GPU)
+# Usage ./UNet_FP32_1GPU_XLA.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+python "${BASEDIR}/../main.py" \
+    --unet_variant='tinyUNet' \
+    --activation_fn='relu' \
+    --exec_mode='train_and_evaluate' \
+    --iter_unit='batch' \
+    --num_iter=2500 \
+    --batch_size=16 \
+    --warmup_step=10 \
+    --results_dir="${1}" \
+    --data_dir="${2}" \
+    --dataset_name='DAGM2007' \
+    --dataset_classID="${3}" \
+    --data_format='NCHW' \
+    --use_auto_loss_scaling \
+    --nouse_tf_amp \
+    --use_xla \
+    --learning_rate=1e-4 \
+    --learning_rate_decay_factor=0.8 \
+    --learning_rate_decay_steps=500 \
+    --rmsprop_decay=0.9 \
+    --rmsprop_momentum=0.8 \
+    --loss_fn_name='adaptive_loss' \
+    --weight_decay=1e-5 \
+    --weight_init_method='he_uniform' \
+    --augment_data \
+    --display_every=250 \
+    --debug_verbosity=0

+ 2 - 2
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_4GPU.sh

@@ -19,7 +19,7 @@
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
 mpirun \
     -np 4 \
@@ -31,7 +31,7 @@ mpirun \
     -x PATH \
     -mca pml ob1 -mca btl ^openib \
     --allow-run-as-root \
-    python ${BASEDIR}/../main.py \
+    python "${BASEDIR}/../main.py" \
         --unet_variant='tinyUNet' \
         --activation_fn='relu' \
         --exec_mode='train_and_evaluate' \

+ 60 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_4GPU_XLA.sh

@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches UNet training in FP32 on 4 GPUs using 16 batch size (4 per GPU)
+# Usage ./UNet_FP32_4GPU_XLA.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+mpirun \
+    -np 4 \
+    -H localhost:4 \
+    -bind-to none \
+    -map-by slot \
+    -x NCCL_DEBUG=VERSION \
+    -x LD_LIBRARY_PATH \
+    -x PATH \
+    -mca pml ob1 -mca btl ^openib \
+    --allow-run-as-root \
+    python "${BASEDIR}/../main.py" \
+        --unet_variant='tinyUNet' \
+        --activation_fn='relu' \
+        --exec_mode='train_and_evaluate' \
+        --iter_unit='batch' \
+        --num_iter=2500 \
+        --batch_size=4 \
+        --warmup_step=10 \
+        --results_dir="${1}" \
+        --data_dir="${2}" \
+        --dataset_name='DAGM2007' \
+        --dataset_classID="${3}" \
+        --data_format='NCHW' \
+        --use_auto_loss_scaling \
+        --nouse_tf_amp \
+        --use_xla \
+        --learning_rate=1e-4 \
+        --learning_rate_decay_factor=0.8 \
+        --learning_rate_decay_steps=500 \
+        --rmsprop_decay=0.9 \
+        --rmsprop_momentum=0.8 \
+        --loss_fn_name='adaptive_loss' \
+        --weight_decay=1e-5 \
+        --weight_init_method='he_uniform' \
+        --augment_data \
+        --display_every=250 \
+        --debug_verbosity=0

+ 2 - 2
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_8GPU.sh

@@ -19,7 +19,7 @@
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
 mpirun \
     -np 8 \
@@ -31,7 +31,7 @@ mpirun \
     -x PATH \
     -mca pml ob1 -mca btl ^openib \
     --allow-run-as-root \
-    python ${BASEDIR}/../main.py \
+    python "${BASEDIR}/../main.py" \
         --unet_variant='tinyUNet' \
         --activation_fn='relu' \
         --exec_mode='train_and_evaluate' \

+ 60 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_8GPU_XLA.sh

@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches UNet training in FP32 on 8 GPUs using 16 batch size (2 per GPU)
+# Usage ./UNet_FP32_8GPU_XLA.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+mpirun \
+    -np 8 \
+    -H localhost:8 \
+    -bind-to none \
+    -map-by slot \
+    -x NCCL_DEBUG=VERSION \
+    -x LD_LIBRARY_PATH \
+    -x PATH \
+    -mca pml ob1 -mca btl ^openib \
+    --allow-run-as-root \
+    python "${BASEDIR}/../main.py" \
+        --unet_variant='tinyUNet' \
+        --activation_fn='relu' \
+        --exec_mode='train_and_evaluate' \
+        --iter_unit='batch' \
+        --num_iter=2500 \
+        --batch_size=2 \
+        --warmup_step=10 \
+        --results_dir="${1}" \
+        --data_dir="${2}" \
+        --dataset_name='DAGM2007' \
+        --dataset_classID="${3}" \
+        --data_format='NCHW' \
+        --use_auto_loss_scaling \
+        --nouse_tf_amp \
+        --use_xla \
+        --learning_rate=1e-4 \
+        --learning_rate_decay_factor=0.8 \
+        --learning_rate_decay_steps=500 \
+        --rmsprop_decay=0.9 \
+        --rmsprop_momentum=0.8 \
+        --loss_fn_name='adaptive_loss' \
+        --weight_decay=1e-5 \
+        --weight_init_method='he_uniform' \
+        --augment_data \
+        --display_every=250 \
+        --debug_verbosity=0

+ 2 - 2
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_EVAL.sh

@@ -19,9 +19,9 @@
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python ${BASEDIR}/../main.py \
+python "${BASEDIR}/../main.py" \
     --unet_variant='tinyUNet' \
     --activation_fn='relu' \
     --exec_mode='evaluate' \

+ 50 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/UNet_FP32_EVAL_XLA.sh

@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches UNet evaluation in FP32 on 1 GPU using 16 batch size
+# Usage ./UNet_FP32_EVAL_XLA.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+python "${BASEDIR}/../main.py" \
+    --unet_variant='tinyUNet' \
+    --activation_fn='relu' \
+    --exec_mode='evaluate' \
+    --iter_unit='epoch' \
+    --num_iter=1 \
+    --batch_size=16 \
+    --warmup_step=10 \
+    --results_dir="${1}" \
+    --data_dir="${2}" \
+    --dataset_name='DAGM2007' \
+    --dataset_classID="${3}" \
+    --data_format='NCHW' \
+    --use_auto_loss_scaling \
+    --nouse_tf_amp \
+    --use_xla \
+    --learning_rate=1e-4 \
+    --learning_rate_decay_factor=0.8 \
+    --learning_rate_decay_steps=500 \
+    --rmsprop_decay=0.9 \
+    --rmsprop_momentum=0.8 \
+    --loss_fn_name='adaptive_loss' \
+    --weight_decay=1e-5 \
+    --weight_init_method='he_uniform' \
+    --augment_data \
+    --display_every=50 \
+    --debug_verbosity=0

+ 11 - 7
TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_evalbench_AMP.sh

@@ -15,13 +15,17 @@
 # limitations under the License.
 
 # This script launches UNet evaluation benchmark in FP32-AMP on 1 GPUs using 16 batch size
-# Usage ./DGX1v_evalbench_FP32AMP.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./DGX1v_evalbench_AMP.sh <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python ${BASEDIR}/../../main.py \
+# Cleaning up for benchmark
+RESULT_DIR="/tmp"
+rm -rf "${RESULT_DIR}"
+
+python "${BASEDIR}/../../main.py" \
     --unet_variant='tinyUNet' \
     --activation_fn='relu' \
     --exec_mode='inference_benchmark' \
@@ -29,14 +33,14 @@ python ${BASEDIR}/../../main.py \
     --num_iter=1500 \
     --batch_size=16 \
     --warmup_step=500 \
-    --results_dir="${1}" \
-    --data_dir="${2}" \
+    --results_dir="${RESULT_DIR}" \
+    --data_dir="${1}" \
     --dataset_name='DAGM2007' \
-    --dataset_classID="${3}" \
+    --dataset_classID="${2}" \
     --data_format='NCHW' \
     --use_auto_loss_scaling \
     --use_tf_amp \
-    --nouse_xla \
+    --use_xla \
     --learning_rate=1e-4 \
     --learning_rate_decay_factor=0.8 \
     --learning_rate_decay_steps=500 \

+ 11 - 7
TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_evalbench_FP32.sh

@@ -15,13 +15,17 @@
 # limitations under the License.
 
 # This script launches UNet evaluation benchmark in FP32 on 1 GPUs using 16 batch size
-# Usage ./DGX1v_evalbench_FP32.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./DGX1v_evalbench_FP32.sh <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python ${BASEDIR}/../../main.py \
+# Cleaning up for benchmark
+RESULT_DIR="/tmp"
+rm -rf "${RESULT_DIR}"
+
+python "${BASEDIR}/../../main.py" \
     --unet_variant='tinyUNet' \
     --activation_fn='relu' \
     --exec_mode='inference_benchmark' \
@@ -29,14 +33,14 @@ python ${BASEDIR}/../../main.py \
     --num_iter=1500 \
     --batch_size=16 \
     --warmup_step=500 \
-    --results_dir="${1}" \
-    --data_dir="${2}" \
+    --results_dir="${RESULT_DIR}" \
+    --data_dir="${1}" \
     --dataset_name='DAGM2007' \
-    --dataset_classID="${3}" \
+    --dataset_classID="${2}" \
     --data_format='NCHW' \
     --use_auto_loss_scaling \
     --nouse_tf_amp \
-    --nouse_xla \
+    --use_xla \
     --learning_rate=1e-4 \
     --learning_rate_decay_factor=0.8 \
     --learning_rate_decay_steps=500 \

+ 11 - 7
TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_AMP_1GPU.sh

@@ -15,13 +15,17 @@
 # limitations under the License.
 
 # This script launches UNet training benchmark in FP32-AMP on 1 GPU using 16 batch size (16 per GPU)
-# Usage ./DGX1v_trainbench_FP32AMP_1GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./DGX1v_trainbench_AMP_1GPU.sh <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python ${BASEDIR}/../../main.py \
+# Cleaning up for benchmark
+RESULT_DIR="/tmp"
+rm -rf "${RESULT_DIR}"
+
+python "${BASEDIR}/../../main.py" \
     --unet_variant='tinyUNet' \
     --activation_fn='relu' \
     --exec_mode='training_benchmark' \
@@ -29,14 +33,14 @@ python ${BASEDIR}/../../main.py \
     --num_iter=1500 \
     --batch_size=16 \
     --warmup_step=500 \
-    --results_dir="${1}" \
-    --data_dir="${2}" \
+    --results_dir="${RESULT_DIR}" \
+    --data_dir="${1}" \
     --dataset_name='DAGM2007' \
-    --dataset_classID="${3}" \
+    --dataset_classID="${2}" \
     --data_format='NCHW' \
     --use_auto_loss_scaling \
     --use_tf_amp \
-    --nouse_xla \
+    --use_xla \
     --learning_rate=1e-4 \
     --learning_rate_decay_factor=0.8 \
     --learning_rate_decay_steps=500 \

+ 11 - 7
TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_AMP_4GPU.sh

@@ -15,11 +15,15 @@
 # limitations under the License.
 
 # This script launches UNet training benchmark in FP32-AMP on 4 GPUs using 16 batch size (4 per GPU)
-# Usage ./DGX1v_trainbench_FP32AMP_4GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./DGX1v_trainbench_AMP_4GPU.sh <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
+
+# Cleaning up for benchmark
+RESULT_DIR="/tmp"
+rm -rf "${RESULT_DIR}"
 
 mpirun \
     -np 4 \
@@ -31,7 +35,7 @@ mpirun \
     -x PATH \
     -mca pml ob1 -mca btl ^openib \
     --allow-run-as-root \
-    python ${BASEDIR}/../../main.py \
+    python "${BASEDIR}/../../main.py" \
         --unet_variant='tinyUNet' \
         --activation_fn='relu' \
         --exec_mode='training_benchmark' \
@@ -39,14 +43,14 @@ mpirun \
         --num_iter=1500 \
         --batch_size=4 \
         --warmup_step=500 \
-        --results_dir="${1}" \
-        --data_dir="${2}" \
+        --results_dir="${RESULT_DIR}" \
+        --data_dir="${1}" \
         --dataset_name='DAGM2007' \
-        --dataset_classID="${3}" \
+        --dataset_classID="${2}" \
         --data_format='NCHW' \
         --use_auto_loss_scaling \
         --use_tf_amp \
-        --nouse_xla \
+        --use_xla \
         --learning_rate=1e-4 \
         --learning_rate_decay_factor=0.8 \
         --learning_rate_decay_steps=500 \

+ 11 - 7
TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_AMP_8GPU.sh

@@ -15,11 +15,15 @@
 # limitations under the License.
 
 # This script launches UNet training benchmark in FP32-AMP on 8 GPUs using 16 batch size (2 per GPU)
-# Usage ./DGX1v_trainbench_FP32AMP_8GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./DGX1v_trainbench_AMP_8GPU.sh <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
+
+# Cleaning up for benchmark
+RESULT_DIR="/tmp"
+rm -rf "${RESULT_DIR}"
 
 mpirun \
     -np 8 \
@@ -31,7 +35,7 @@ mpirun \
     -x PATH \
     -mca pml ob1 -mca btl ^openib \
     --allow-run-as-root \
-    python ${BASEDIR}/../../main.py \
+    python "${BASEDIR}/../../main.py" \
         --unet_variant='tinyUNet' \
         --activation_fn='relu' \
         --exec_mode='training_benchmark' \
@@ -39,14 +43,14 @@ mpirun \
         --num_iter=1500 \
         --batch_size=2 \
         --warmup_step=500 \
-        --results_dir="${1}" \
-        --data_dir="${2}" \
+        --results_dir="${RESULT_DIR}" \
+        --data_dir="${1}" \
         --dataset_name='DAGM2007' \
-        --dataset_classID="${3}" \
+        --dataset_classID="${2}" \
         --data_format='NCHW' \
         --use_auto_loss_scaling \
         --use_tf_amp \
-        --nouse_xla \
+        --use_xla \
         --learning_rate=1e-4 \
         --learning_rate_decay_factor=0.8 \
         --learning_rate_decay_steps=500 \

+ 11 - 7
TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_FP32_1GPU.sh

@@ -15,13 +15,17 @@
 # limitations under the License.
 
 # This script launches UNet training benchmark in FP32 on 1 GPU using 16 batch size (16 per GPU)
-# Usage ./DGX1v_trainbench_FP32_1GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./DGX1v_trainbench_FP32_1GPU.sh <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python ${BASEDIR}/../../main.py \
+# Cleaning up for benchmark
+RESULT_DIR="/tmp"
+rm -rf "${RESULT_DIR}"
+
+python "${BASEDIR}/../../main.py" \
     --unet_variant='tinyUNet' \
     --activation_fn='relu' \
     --exec_mode='training_benchmark' \
@@ -29,14 +33,14 @@ python ${BASEDIR}/../../main.py \
     --num_iter=1500 \
     --batch_size=16 \
     --warmup_step=500 \
-    --results_dir="${1}" \
-    --data_dir="${2}" \
+    --results_dir="${RESULT_DIR}" \
+    --data_dir="${1}" \
     --dataset_name='DAGM2007' \
-    --dataset_classID="${3}" \
+    --dataset_classID="${2}" \
     --data_format='NCHW' \
     --use_auto_loss_scaling \
     --nouse_tf_amp \
-    --nouse_xla \
+    --use_xla \
     --learning_rate=1e-4 \
     --learning_rate_decay_factor=0.8 \
     --learning_rate_decay_steps=500 \

+ 11 - 7
TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_FP32_4GPU.sh

@@ -15,11 +15,15 @@
 # limitations under the License.
 
 # This script launches UNet training benchmark in FP32 on 4 GPUs using 16 batch size (4 per GPU)
-# Usage ./DGX1v_trainbench_FP32_4GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./DGX1v_trainbench_FP32_4GPU.sh <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
+
+# Cleaning up for benchmark
+RESULT_DIR="/tmp"
+rm -rf "${RESULT_DIR}"
 
 mpirun \
     -np 4 \
@@ -31,7 +35,7 @@ mpirun \
     -x PATH \
     -mca pml ob1 -mca btl ^openib \
     --allow-run-as-root \
-    python ${BASEDIR}/../../main.py \
+    python "${BASEDIR}/../../main.py" \
         --unet_variant='tinyUNet' \
         --activation_fn='relu' \
         --exec_mode='training_benchmark' \
@@ -39,14 +43,14 @@ mpirun \
         --num_iter=1500 \
         --batch_size=4 \
         --warmup_step=500 \
-        --results_dir="${1}" \
-        --data_dir="${2}" \
+        --results_dir="${RESULT_DIR}" \
+        --data_dir="${1}" \
         --dataset_name='DAGM2007' \
-        --dataset_classID="${3}" \
+        --dataset_classID="${2}" \
         --data_format='NCHW' \
         --use_auto_loss_scaling \
         --nouse_tf_amp \
-        --nouse_xla \
+        --use_xla \
         --learning_rate=1e-4 \
         --learning_rate_decay_factor=0.8 \
         --learning_rate_decay_steps=500 \

+ 11 - 7
TensorFlow/Segmentation/UNet_Industrial/scripts/benchmarking/DGX1v_trainbench_FP32_8GPU.sh

@@ -15,11 +15,15 @@
 # limitations under the License.
 
 # This script launches UNet training benchmark in FP32 on 8 GPUs using 16 batch size (2 per GPU)
-# Usage ./DGX1v_trainbench_FP32_8GPU.sh <path to result repository> <path to dataset> <dagm classID (1-10)>
+# Usage ./DGX1v_trainbench_FP32_8GPU.sh <path to dataset> <dagm classID (1-10)>
 
 BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-pip install ${BASEDIR}/../../dllogger/
+export TF_CPP_MIN_LOG_LEVEL=3
+
+# Cleaning up for benchmark
+RESULT_DIR="/tmp"
+rm -rf "${RESULT_DIR}"
 
 mpirun \
     -np 8 \
@@ -31,7 +35,7 @@ mpirun \
     -x PATH \
     -mca pml ob1 -mca btl ^openib \
     --allow-run-as-root \
-    python ${BASEDIR}/../../main.py \
+    python "${BASEDIR}/../../main.py" \
         --unet_variant='tinyUNet' \
         --activation_fn='relu' \
         --exec_mode='training_benchmark' \
@@ -39,14 +43,14 @@ mpirun \
         --num_iter=1500 \
         --batch_size=2 \
         --warmup_step=500 \
-        --results_dir="${1}" \
-        --data_dir="${2}" \
+        --results_dir="${RESULT_DIR}" \
+        --data_dir="${1}" \
         --dataset_name='DAGM2007' \
-        --dataset_classID="${3}" \
+        --dataset_classID="${2}" \
         --data_format='NCHW' \
         --use_auto_loss_scaling \
         --nouse_tf_amp \
-        --nouse_xla \
+        --use_xla \
         --learning_rate=1e-4 \
         --learning_rate_decay_factor=0.8 \
         --learning_rate_decay_steps=500 \

+ 24 - 0
TensorFlow/Segmentation/UNet_Industrial/scripts/launch_docker.sh

@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+DATASET_DIR=$(realpath -s "$1")
+RESULT_DIR=$(realpath -s "$2")
+
+if [[ ! -e ${DATASET_DIR} ]]; then
+    echo "creating ${DATASET_DIR} ..."
+    mkdir -p "${DATASET_DIR}"
+fi
+
+if [[ ! -e ${RESULT_DIR} ]]; then
+    echo "creating ${RESULT_DIR} ..."
+    mkdir -p "${RESULT_DIR}"
+fi
+
+# Build the docker container
+docker build . --rm -t unet_industrial:latest
+
+# start the container with nvidia-docker
+nvidia-docker run -it --rm \
+    --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v ${DATASET_DIR}:/data/dagm2007/ \
+    -v ${RESULT_DIR}:/results \
+    unet_industrial:latest

+ 1 - 0
TensorFlow/Segmentation/UNet_Industrial/utils/__init__.py

@@ -24,5 +24,6 @@ from utils import hooks
 from utils import cmdline_helper
 from utils import hvd_utils
 from utils import image_processing
+from utils import logging
 from utils import losses
 from utils import metrics

+ 11 - 3
TensorFlow/Segmentation/UNet_Industrial/utils/cmdline_helper.py

@@ -95,6 +95,14 @@ def parse_cmdline():
         help="""Directory in which to write training logs, summaries and checkpoints."""
     )
 
+    p.add_argument(
+        '--log_dir',
+        type=str,
+        required=False,
+        default="dllogger_out.json",
+        help="""File path in which to write the dllogger JSON log."""
+    )
+
     _add_bool_argument(
         parser=p,
         name="save_eval_results_to_json",
@@ -151,11 +159,11 @@ def parse_cmdline():
         help="""Which initialisation method is used to randomly intialize the model during training"""
     )
 
-    p.add_argument('--learning_rate', default=1e-5, type=float, required=False, help="""Learning rate value.""")
+    p.add_argument('--learning_rate', default=1e-4, type=float, required=False, help="""Learning rate value.""")
 
     p.add_argument(
         '--learning_rate_decay_factor',
-        default=0.75,
+        default=0.8,
         type=float,
         required=False,
         help="""Decay factor to decrease the learning rate."""
@@ -173,7 +181,7 @@ def parse_cmdline():
 
     p.add_argument('--rmsprop_momentum', default=0.8, type=float, required=False, help="""RMSProp - Momentum value.""")
 
-    p.add_argument('--weight_decay', default=1e-4, type=float, required=False, help="""Weight Decay scale factor""")
+    p.add_argument('--weight_decay', default=1e-5, type=float, required=False, help="""Weight Decay scale factor""")
 
     _add_bool_argument(
         parser=p, name="use_auto_loss_scaling", default=False, required=False, help="Use AutoLossScaling with TF-AMP"

+ 137 - 22
TensorFlow/Segmentation/UNet_Industrial/utils/hooks/profiler_hook.py

@@ -27,7 +27,7 @@ import operator
 import numpy as np
 import tensorflow as tf
 
-from dllogger.logger import LOGGER
+import dllogger as Logger
 
 __all__ = ["ProfilerHook"]
 
@@ -60,15 +60,79 @@ class ProfilerHook(tf.train.SessionRunHook):
         ret[n:] = ret[n:] - ret[:-n]
         return ret[n - 1:] / n
 
-    def begin(self):
-        LOGGER.log_hardware()
-
     def after_create_session(self, session, coord):
 
         params_count = tf.get_default_graph().get_tensor_by_name("trainable_parameters_count_ref:0")
         _params_count = session.run(params_count)
 
-        LOGGER.log("# Total Trainable Parameters:", int(_params_count))
+        Logger._stage = "train" if self._is_training else "eval"
+
+        Logger.log(
+            step=(),
+            data={"# Total Trainable Parameters": int(_params_count)}, verbosity=Logger.Verbosity.DEFAULT
+        )
+
+        Logger.metadata(
+            metric="{prefix}.avg_ips".format(prefix=Logger._stage),
+            metadata={"unit": "imgs/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": Logger._stage.upper()}
+        )
+
+        for ths in [0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99]:
+            Logger.metadata(
+                metric="{prefix}.IoU_THS_{ths}".format(prefix=Logger._stage, ths=ths),
+                metadata={"format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": Logger._stage.upper()}
+            )
+
+        if self._is_training:
+            Logger.metadata(
+                metric="{prefix}.learning_rate".format(prefix=Logger._stage),
+                metadata={"format": ":.3e", "GOAL": "NONE", "STAGE": Logger._stage.upper()}
+            )
+
+            Logger.metadata(
+                metric="{prefix}.weight_decay".format(prefix=Logger._stage),
+                metadata={"format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": Logger._stage.upper()}
+            )
+
+            Logger.metadata(
+                metric="{prefix}.reconstruction_loss".format(prefix=Logger._stage),
+                metadata={"format": ":.3f", "GOAL": "MINIMIZE", "STAGE": Logger._stage.upper()}
+            )
+
+            Logger.metadata(
+                metric="{prefix}.total_loss".format(prefix=Logger._stage),
+                metadata={"format": ":.3f", "GOAL": "MINIMIZE", "STAGE": Logger._stage.upper()}
+            )
+
+        Logger.metadata(
+            metric="{prefix}.true_positives".format(prefix=Logger._stage),
+            metadata={"STAGE": Logger._stage.upper()}
+        )
+
+        Logger.metadata(
+            metric="{prefix}.true_negatives".format(prefix=Logger._stage),
+            metadata={"STAGE": Logger._stage.upper()}
+        )
+
+        Logger.metadata(
+            metric="{prefix}.false_positives".format(prefix=Logger._stage),
+            metadata={"STAGE": Logger._stage.upper()}
+        )
+
+        Logger.metadata(
+            metric="{prefix}.false_negatives".format(prefix=Logger._stage),
+            metadata={"STAGE": Logger._stage.upper()}
+        )
+
+        Logger.metadata(
+            metric="{prefix}.true_positive_rate".format(prefix=Logger._stage),
+            metadata={"STAGE": Logger._stage.upper()}
+        )
+
+        Logger.metadata(
+            metric="{prefix}.true_negative_rate".format(prefix=Logger._stage),
+            metadata={"STAGE": Logger._stage.upper()}
+        )
 
         self._start_training_time = time.time()
 
@@ -154,22 +218,64 @@ class ProfilerHook(tf.train.SessionRunHook):
             if self._current_step > self._warmup_steps:
                 imgs_per_sec = float(ProfilerHook.moving_average(self._processing_speed_arr, n=30)[-1])
 
-            LOGGER.log("iteration", int(self._current_step))
-            LOGGER.log("total_ips", float(imgs_per_sec))
+            Logger.log(
+                step=(self._current_step,),
+                data={"{prefix}.avg_ips".format(prefix=Logger._stage): float(imgs_per_sec)},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
 
             if self._is_training:
-                LOGGER.log("weight_decay", float(run_values.results["weight_decay"]))
-                LOGGER.log("reconstruction_loss", float(run_values.results["reconstruction_loss"]))
-                LOGGER.log("total_loss", float(run_values.results["total_loss"]))
-                LOGGER.log("learning_rate", float(run_values.results["learning_rate"]))
+                Logger.log(
+                    step=(self._current_step,),
+                    data={"{prefix}.weight_decay".format(prefix=Logger._stage): float(run_values.results["weight_decay"])},
+                    verbosity=Logger.Verbosity.DEFAULT
+                )
+                Logger.log(
+                    step=(self._current_step,),
+                    data={"{prefix}.reconstruction_loss".format(prefix=Logger._stage): float(run_values.results["reconstruction_loss"])},
+                    verbosity=Logger.Verbosity.DEFAULT
+                )
+                Logger.log(
+                    step=(self._current_step,),
+                    data={"{prefix}.total_loss".format(prefix=Logger._stage): float(run_values.results["total_loss"])},
+                    verbosity=Logger.Verbosity.DEFAULT
+                )
+                Logger.log(
+                    step=(self._current_step,),
+                    data={"{prefix}.learning_rate".format(prefix=Logger._stage): float(run_values.results["learning_rate"])},
+                    verbosity=Logger.Verbosity.DEFAULT
+                )
 
             for key, val in sorted(run_values.results["iou_scores"].items(), key=operator.itemgetter(0)):
-                LOGGER.log("iou_score - THS %s" % key, float(val))
-
-            LOGGER.log("True Positives:", run_values.results["confusion_matrix"]["tp"])
-            LOGGER.log("True Negatives:", run_values.results["confusion_matrix"]["tn"])
-            LOGGER.log("False Positives:", run_values.results["confusion_matrix"]["fp"])
-            LOGGER.log("False Negatives:", run_values.results["confusion_matrix"]["fn"])
+                Logger.log(
+                    step=(self._current_step,),
+                    data={"{prefix}.IoU_THS_{ths}".format(prefix=Logger._stage, ths=key): float(val)},
+                    verbosity=Logger.Verbosity.DEFAULT
+                )
+
+            Logger.log(
+                step=(self._current_step,),
+                data={"{prefix}.true_positives".format(prefix=Logger._stage): str(run_values.results["confusion_matrix"]["tp"])},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
+
+            Logger.log(
+                step=(self._current_step,),
+                data={"{prefix}.true_negatives".format(prefix=Logger._stage): str(run_values.results["confusion_matrix"]["tn"])},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
+
+            Logger.log(
+                step=(self._current_step,),
+                data={"{prefix}.false_positives".format(prefix=Logger._stage): str(run_values.results["confusion_matrix"]["fp"])},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
+
+            Logger.log(
+                step=(self._current_step,),
+                data={"{prefix}.false_negatives".format(prefix=Logger._stage): str(run_values.results["confusion_matrix"]["fn"])},
+                verbosity=Logger.Verbosity.DEFAULT
+            )
 
             if self._sample_dir is not None and self._is_training:
 
@@ -203,11 +309,20 @@ class ProfilerHook(tf.train.SessionRunHook):
         total_processing_hours, rem = divmod(total_processing_time, 3600)
         total_processing_minutes, total_processing_seconds = divmod(rem, 60)
 
-        LOGGER.log(
-            "Final Summary:\n"
-            "\t[*] Average Imgs/sec: %d\n"
-            "\t[*] Total Processing Time: %dh %02dm %02ds\n" %
-            (avg_processing_speed, total_processing_hours, total_processing_minutes, total_processing_seconds)
+        print("\n============== Final Summary ==============")
+        Logger.log(
+            step=(),
+            data={"{prefix}.avg_ips".format(prefix=Logger._stage): avg_processing_speed},
+            verbosity=Logger.Verbosity.DEFAULT
+        )
+        Logger.log(
+            step=(),
+            data={"{prefix} - Total Processing Time".format(prefix=Logger._stage.capitalize()): "%dh %02dm %02ds" % (
+                total_processing_hours,
+                total_processing_minutes,
+                total_processing_seconds
+            )},
+            verbosity=Logger.Verbosity.DEFAULT
         )
 
         perf_dict = {'throughput': str(avg_processing_speed), 'processing_time': str(total_processing_time)}

+ 1 - 6
TensorFlow/Segmentation/UNet_Industrial/utils/hvd_utils.py

@@ -25,9 +25,4 @@ __all__ = ["is_using_hvd"]
 
 
 def is_using_hvd():
-    env_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"]
-
-    if all([var in os.environ for var in env_vars]):
-        return True
-    else:
-        return False
+    return True

+ 50 - 0
TensorFlow/Segmentation/UNet_Industrial/utils/logging.py

@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# ==============================================================================
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+import dllogger as Logger
+
+
+def format_step(step):
+    if isinstance(step, str):
+        return step
+
+    if isinstance(step, int):
+        return "Iteration: {} ".format(step)
+
+    s = ""
+
+    if len(step) > 0:
+        s += "Epoch: {} ".format(step[0])
+
+    if len(step) > 1:
+        s += "Iteration: {} ".format(step[1])
+
+    if len(step) > 2:
+        s += "Validation Iteration: {} ".format(step[2])
+
+    return s
+
+
+def init_dllogger(log_dir):
+    Logger.init([
+        Logger.StdOutBackend(Logger.Verbosity.DEFAULT, step_format=format_step),
+        Logger.JSONStreamBackend(Logger.Verbosity.VERBOSE, log_dir)
+    ])