Przemek Strzelczyk 5 лет назад
Родитель
Сommit
79d4ced0be
28 измененных файлов с 2399 добавлено и 0 удалено
  1. 10 0
      TensorFlow/Segmentation/UNet_3D_Medical/.dockerignore
  2. 12 0
      TensorFlow/Segmentation/UNet_3D_Medical/.gitignore
  3. 9 0
      TensorFlow/Segmentation/UNet_3D_Medical/Dockerfile
  4. 202 0
      TensorFlow/Segmentation/UNet_3D_Medical/LICENSE
  5. 589 0
      TensorFlow/Segmentation/UNet_3D_Medical/README.md
  6. 254 0
      TensorFlow/Segmentation/UNet_3D_Medical/dataset/data_loader.py
  7. 161 0
      TensorFlow/Segmentation/UNet_3D_Medical/dataset/preprocess_data.py
  8. 208 0
      TensorFlow/Segmentation/UNet_3D_Medical/dataset/transforms.py
  9. 19 0
      TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_infer_benchmark.sh
  10. 19 0
      TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_infer_benchmark_TF-AMP.sh
  11. 19 0
      TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_benchmark.sh
  12. 19 0
      TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_benchmark_TF-AMP.sh
  13. 25 0
      TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_full.sh
  14. 25 0
      TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_full_TF-AMP.sh
  15. 19 0
      TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_single.sh
  16. 19 0
      TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_single_TF-AMP.sh
  17. BIN
      TensorFlow/Segmentation/UNet_3D_Medical/images/U-Net3D_TF1_conv.png
  18. BIN
      TensorFlow/Segmentation/UNet_3D_Medical/images/unet3d.png
  19. 109 0
      TensorFlow/Segmentation/UNet_3D_Medical/main.py
  20. 135 0
      TensorFlow/Segmentation/UNet_3D_Medical/model/layers.py
  21. 83 0
      TensorFlow/Segmentation/UNet_3D_Medical/model/losses.py
  22. 72 0
      TensorFlow/Segmentation/UNet_3D_Medical/model/model_fn.py
  23. 82 0
      TensorFlow/Segmentation/UNet_3D_Medical/model/unet3d.py
  24. 1 0
      TensorFlow/Segmentation/UNet_3D_Medical/requirements.txt
  25. 48 0
      TensorFlow/Segmentation/UNet_3D_Medical/runtime/arguments.py
  26. 110 0
      TensorFlow/Segmentation/UNet_3D_Medical/runtime/hooks.py
  27. 66 0
      TensorFlow/Segmentation/UNet_3D_Medical/runtime/parse_results.py
  28. 84 0
      TensorFlow/Segmentation/UNet_3D_Medical/runtime/setup.py

+ 10 - 0
TensorFlow/Segmentation/UNet_3D_Medical/.dockerignore

@@ -0,0 +1,10 @@
+.ipynb_checkpoints
+/_python_build
+*.pyc
+__pycache__
+core
+*.swp
+/datasets
+/results
+results
+./data

+ 12 - 0
TensorFlow/Segmentation/UNet_3D_Medical/.gitignore

@@ -0,0 +1,12 @@
+.idea/
+*.tar
+.ipynb_checkpoints
+/_python_build
+*.pyc
+__pycache__
+*.swp
+/datasets
+/results
+results
+./data
+

+ 9 - 0
TensorFlow/Segmentation/UNet_3D_Medical/Dockerfile

@@ -0,0 +1,9 @@
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
+FROM ${FROM_IMAGE_NAME}
+
+ADD . /workspace/unet3d
+WORKDIR /workspace/unet3d
+
+RUN pip install --upgrade pip
+RUN pip install git+https://github.com/NVIDIA/dllogger
+RUN pip install --disable-pip-version-check -r requirements.txt

+ 202 - 0
TensorFlow/Segmentation/UNet_3D_Medical/LICENSE

@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 589 - 0
TensorFlow/Segmentation/UNet_3D_Medical/README.md

@@ -0,0 +1,589 @@
+# 3D-UNet Medical Image Segmentation for TensorFlow 1.x
+ 
+This repository provides a script and recipe to train 3D-UNet to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
+ 
+## Table of Contents
+ 
+- [Model overview](#model-overview)
+   * [Model architecture](#model-architecture)
+   * [Default configuration](#default-configuration)
+   * [Feature support matrix](#feature-support-matrix)
+     * [Features](#features)
+   * [Mixed precision training](#mixed-precision-training)
+     * [Enabling mixed precision](#enabling-mixed-precision)
+     * [Enabling TF32](#enabling-tf32)
+- [Setup](#setup)
+   * [Requirements](#requirements)
+- [Quick Start Guide](#quick-start-guide)
+- [Advanced](#advanced)
+   * [Scripts and sample code](#scripts-and-sample-code)
+   * [Parameters](#parameters)
+   * [Command-line options](#command-line-options)
+   * [Getting the data](#getting-the-data)
+     * [Dataset guidelines](#dataset-guidelines)
+     * [Multi-dataset](#multi-dataset)
+   * [Training process](#training-process)
+   * [Inference process](#inference-process)
+- [Performance](#performance)   
+   * [Benchmarking](#benchmarking)
+     * [Training performance benchmark](#training-performance-benchmark)
+     * [Inference performance benchmark](#inference-performance-benchmark)
+   * [Results](#results)
+     * [Training accuracy results](#training-accuracy-results) 
+       * [Training accuracy: NVIDIA DGX-1 (8x V100 32GB)](#training-accuracy-nvidia-dgx-1-8x-v100-32gb)
+     * [Training performance results](#training-performance-results)
+       * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
+       * [Training performance: NVIDIA DGX-1 (8x V100 32GB)](#training-performance-nvidia-dgx-1-8x-v100-32gb)
+     * [Inference performance results](#inference-performance-results)
+        * [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
+        * [Inference performance: NVIDIA DGX-1 (1x V100 32GB)](#inference-performance-nvidia-dgx-1-1x-v100-32gb)
+- [Release notes](#release-notes)
+   * [Changelog](#changelog)
+   * [Known issues](#known-issues)
+
+ 
+## Model overview
+ 
+The U-Net model is a convolutional neural network for 3D image segmentation. This repository contains a 3D-UNet implementation introduced in [3D U-Net: Learning Dense Volumetric Segmentation from Sparse Annotation](https://arxiv.org/pdf/1606.06650), with modifications described in [No New-Net](https://arxiv.org/pdf/1809.10483).
+ 
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results up to 2.3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+
+
+### Model architecture
+ 
+3D-UNet was first introduced by Olaf Ronneberger, Philip Fischer, and Thomas Brox in the paper: [3D U-Net: Learning Dense Volumetric Segmentation from Sparse Annotation](https://arxiv.org/pdf/1606.06650). In this repository we host a 3D-UNet version adapted by Fabian Isensee et al. to brain tumor segmentation. 3D-UNet allows for seamless segmentation of 3D volumes, with high accuracy and performance, and can be adapted to solve many different segmentation problems.
+ 
+The following figure shows the construction of the 3D-UNet model and its different components. 3D-UNet is composed of a contractive and an expanding path, that aims at building a bottleneck in its centermost part through a combination of convolution and pooling operations. After this bottleneck, the image is reconstructed through a combination of convolutions and upsampling. Skip connections are added with the goal of helping the backward flow of gradients in order to improve the training.
+ 
+![U-Net3D](images/unet3d.png)
+ 
+### Default configuration
+ 
+3D-UNet consists of a contractive (left-side) and expanding (right-side) path. It repeatedly applies unpadded convolutions followed by max pooling for downsampling. Every step in the expanding path consists of an upsampling of the feature maps and a concatenation with the correspondingly cropped feature map from the contractive path.
+ 
+### Feature support matrix
+ 
+The following features are supported by this model.
+ 
+| **Feature** | **3D-UNet** |
+|---------------------------------|-----|
+| Automatic mixed precision (AMP) | Yes |
+| Horovod Multi-GPU (NCCL)        | Yes |
+| Accelerated Linear Algebra (XLA)| Yes |
+ 
+#### Features
+ 
+**Automatic Mixed Precision (AMP)**
+ 
+This implementation of 3D-UNet uses AMP to implement mixed precision training. Computation graphs can be modified by TensorFlow on runtime to support mixed precision training. Detailed explanation of mixed precision can be found in the next section.
+ 
+**Horovod**
+ 
+Horovod is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod).
+ 
+Multi-GPU training with Horovod
+ 
+Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage).
+ 
+**XLA support (experimental)**
+ 
+XLA is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. The results are improvements in speed and memory usage: most internal benchmarks run ~1.1-1.5x faster after XLA is enabled.
+ 
+### Mixed precision training
+ 
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) previously required two steps:
+1.  Porting the model to use the FP16 data type where appropriate.    
+2.  Adding loss scaling to preserve small gradient values.
+
+This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code.  AMP enables mixed precision training on Volta, Turing, and NVIDIA Ampere GPU architectures automatically. The TensorFlow framework code makes all necessary model changes internally.
+
+In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
+
+For information about:
+-   How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
+-   Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
+-   How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
+
+#### Enabling mixed precision
+ 
+Mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically by using simple multiplication of loss by a constant value or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
+
+To enable mixed precision, you can simply add the values to the environmental variable inside your training script:
+  ```
+  os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
+  ```
+Exporting these variables ensures that loss scaling is performed correctly and automatically.
+By supplying the `--amp` flag to the `main.py` script while training in FP32/TF32, the following variables are set to their correct value for mixed precision training:
+```
+if params.amp:
+  os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
+```
+
+
+ #### Enabling TF32
+
+TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. 
+
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
+
+For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
+
+TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
+ 
+## Setup
+ 
+The following section lists the requirements that you need to meet in order to start training the 3D-UNet model.
+ 
+### Requirements
+ 
+This repository contains Dockerfile which extends the TensorFlow NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+- TensorFlow 20.06-tf1-py3 [NGC container](https://ngc.nvidia.com/registry/nvidia-tensorflow)
+-   GPU-based architecture:
+    - [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+    - [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/)
+    - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
+
+ 
+For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
+- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
+- [Accessing And Pulling From The NGC container registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
+- [Running TensorFlow](https://docs.nvidia.com/deeplearning/dgx/tensorflow-release-notes/running.html#running)
+ 
+For those unable to use the TensorFlow NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
+
+
+## Quick Start Guide
+ 
+To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the 3D-UNet model on the [Brain Tumor Segmentation 2019](https://www.med.upenn.edu/cbica/brats-2019/) dataset. These steps enable you to build the 3D-UNet TensorFlow NGC container, train and evaluate your model, and generate predictions on the test data. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
+ 
+ 
+1. Clone the repository.
+ 
+    Executing this command will create your local repository with all the code to run 3D-UNet.
+    
+    ```bash
+    git clone https://github.com/NVIDIA/DeepLearningExamples
+    cd DeepLearningExamples/TensorFlow/Segmentation/UNet_3D_Medical
+    ```
+ 
+2. Build the U-Net TensorFlow NGC container.
+ 
+    This command will use the `Dockerfile` to create a Docker image named `unet3d_tf`, downloading all the required components automatically.
+    
+    ```bash
+    docker build -t unet3d_tf .
+    ```
+    
+    The NGC container contains all the components optimized for usage on NVIDIA hardware.
+ 
+3. Start an interactive session in the NGC container to run preprocessing/training/inference.
+ 
+    The following command will launch the container and mount the `./data` directory as a volume to the `/data` directory inside the container, and `./results` directory to the `/results` directory in the container.
+    
+    ```bash
+    mkdir data
+    mkdir results
+    docker run --runtime=nvidia -it --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --rm --ipc=host -v ${PWD}/data:/data -v ${PWD}/results:/results unet3d_tf:latest /bin/bash
+    ```
+    
+    Any datasets and experiment results (logs, checkpoints, etc.) saved to `/data` or `/results` will be accessible
+    in the `./data` or `./results` directory on the host, respectively.
+ 
+4. Download and pre-process the data.
+   
+    Data can be obtained by registering on [Brain Tumor Segmentation 2019 dataset](https://www.med.upenn.edu/cbica/brats-2019/) website. The data should be downloaded and placed where `/data` in the container is mounted. The `dataset/preprocess_data.py` script will convert the raw data into tfrecord format used for training and evaluation.
+
+    The script can be launched as 
+    
+    ```bash
+    python dataset/preprocess_data.py -i /data/<name/of/the/raw/data/folder> -o /data/<name/of/the/preprocessed/data/folder> -v
+    ```
+ 
+5. Start training.
+  
+    After the Docker container is launched, the training of a single fold (fold 0) with the [default hyperparameters](#default-configuration) (for example 1/8 GPUs TF-AMP/FP32/TF32) can be started with:
+    
+    ```bash
+    bash examples/unet3d_train_single{_TF-AMP}.sh <number/of/gpus> <path/to/dataset> <path/to/checkpoint> <batch/size>
+    ```
+    
+    For example, to run with 32-bit precision (FP32 or TF32) with batch size 2 on 1 GPU, simply use:
+    
+    ```bash
+    bash examples/unet3d_train_single.sh 1 /data/preprocessed /results 2
+    ```
+    
+    To train a single fold with mixed precision (TF-AMP) on 8 GPUs with batch size 2 per GPU, use:
+    
+    ```bash
+    bash examples/unet3d_train_single_TF-AMP.sh 8 /data/preprocessed /results 2
+    ```
+    The obtained dice scores will be reported after the training has finished.
+ 
+6. Start benchmarking.
+  
+    The training performance can be evaluated by using benchmarking scripts, such as:
+    
+    ```bash
+    bash examples/unet3d_{train,infer}_benchmark{_TF-AMP}.sh <number/of/gpus/for/training> <path/to/dataset> <path/to/checkpoint> <batch/size>
+    ```
+    
+    which will make the model run and report the performance. For example, to benchmark training with TF-AMP with batch size 2 on 4 GPUs, use:
+    
+    ```bash
+    bash examples/unet3d_train_benchmark_TF-AMP.sh 4 /data/preprocessed /results 2
+    ```
+    
+    to obtain inference performance with 32-bit precision (FP32 or TF32) with batch size 1, use:
+    
+    ```bash
+    bash examples/unet3d_infer_benchmark.sh /data/preprocessed /results 1
+    ```
+
+## Advanced
+ 
+The following sections provide greater details of the dataset, running training and inference, and the training results.
+ 
+### Scripts and sample code
+ 
+In the root directory, the most important files are:
+* `main.py`: Serves as the entry point to the application. Encapsulates the training routine.
+* `Dockerfile`: Container with the basic set of dependencies to run U-Net.
+* `requirements.txt`: Set of extra requirements for running U-Net.
+* `dataset/preprocess_data.py`: Converts the dataset to tfrecord format for training.
+ 
+The `dataset/` folder contains the necessary tools to train and perform inference using U-Net. Its main components are:
+* `data_loader.py`: Implements the data loading and augmentation.
+* `transforms.py`: Implements the data augmentation functions.
+* `preprocess_data.py`: Implements the data conversion and pre-processing functionality.
+ 
+The `runtime/` folder contains scripts with training and inference logic. Its contents are:
+* `arguments.py`: Implements the command-line arguments parsing.
+* `hooks.py`: Collects different metrics to be used for benchmarking and testing.
+* `parse_results.py`: Defines a set of functions used for parsing the partial results.
+* `setup.py`: Defines a set of functions to set the environment up.
+ 
+ The `model/` folder contains information about the building blocks of 3D-UNet and the way they are assembled. Its contents are:
+* `layers.py`: Defines the different blocks that are used to assemble 3D-UNet.
+* `losses.py`: Defines the different losses used during training and evaluation.
+* `model_fn.py`: Defines the computational graph to optimize.
+* `unet3d.py`: Defines the model architecture using the blocks from the `layers.py` file.
+
+Other folders included in the root directory are:
+* `examples/`: Provides examples for training and benchmarking U-Net
+* `images/`: Contains the model diagram
+ 
+### Parameters
+ 
+The complete list of the available parameters for the main.py script contains:
+* `--exec_mode`: Select the execution mode to run the model (default: `train`). Modes available:
+  * `train` - trains a model and stores checkpoints in the directory passed using `--model_dir`
+  * `evaluate` - loads checkpoint (if available) and performs evaluation on validation subset (requires `--fold` other than `None`).
+  * `train_and_evaluate` - trains model from scratch and performs validation at the end (requires `--fold` other than `None`).
+  * `predict` - loads checkpoint (if available) and runs inference on the test set. Stores the results in the `--model_dir` directory.
+  * `train_and_predict` - trains model from scratch and performs inference.
+* `--model_dir`: Set the output directory for information related to the model (default: `/results`).
+* `--log_dir`: Set the output directory for logs (default: None).
+* `--data_dir`: Set the input directory containing the dataset (default: `None`).
+* `--batch_size`: Size of each minibatch per GPU (default: `1`).
+* `--fold`: Selected fold for cross-validation (default: `None`).
+* `--max_steps`: Maximum number of steps (batches) for training (default: `16000`).
+* `--seed`: Set random seed for reproducibility (default: `0`).
+* `--log_every`: Log performance every n steps (default: `100`).
+* `--learning_rate`: Model’s learning rate (default: `0.0002`).
+* `--augment`: Enable data augmentation (disabled by default).
+* `--benchmark`: Enable performance benchmarking (disabled by default). If the flag is set, the script runs in a benchmark mode - each iteration is timed and the performance result (in images per second) is printed at the end. Works for both `train` and `predict` execution modes.
+* `--warmup_steps`: Used during benchmarking - the number of steps to skip (default: `200`). First iterations are usually much slower since the graph is being constructed. Skipping the initial iterations is required for a fair performance assessment.
+* `--resume_training`: Whether to resume training from a checkpoint, if there is one (disabled by default)
+* `--xla`: Enable accelerated linear algebra optimization (disabled by default).
+* `--amp`: Enable automatic mixed precision (disabled by default).
+ 
+### Command line options
+ 
+To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
+```bash
+python main.py --help
+```
+ 
+The following example output is printed when running the model:
+```python main.py --help
+usage: main.py [-h] --data_dir DATA_DIR --model_dir MODEL_DIR
+               [--exec_mode {train,evaluate,train_and_evaluate,predict}]
+               [--benchmark] [--max_steps MAX_STEPS]
+               [--learning_rate LEARNING_RATE] [--log_every LOG_EVERY]
+               [--log_dir LOG_DIR] [--loss {dice,ce,dice+ce}]
+               [--warmup_steps WARMUP_STEPS][--resume_training] 
+               [--augment] [--batch_size BATCH_SIZE] [--fold FOLD] 
+               [--amp] [--xla]
+ 
+UNet-3D
+ 
+optional arguments:
+ -h, --help            show this help message and exit
+ --model_dir MODEL_DIR
+                       Output directory for information related to the model
+ --data_dir DATA_DIR   Input directory containing the dataset for training
+                       the model
+ --exec_mode {train,train_and_predict,predict,evaluate,train_and_evaluate}
+                       Execution mode of running the model
+ --log_dir LOG_DIR     Output directory for training logs
+ --batch_size BATCH_SIZE
+                       Size of each minibatch per GPU
+ --learning_rate LEARNING_RATE
+                       Learning rate coefficient for AdamOptimizer
+ --fold                Fold number
+                       Chosen fold for cross-validation. Use None to disable
+                       cross-validation
+ --max_steps MAX_STEPS
+                       Maximum number of steps (batches) used for training
+ --log_every LOG_EVERY
+                       Log performance every n steps
+ --warmup_steps WARMUP_STEPS
+                       Number of warmup steps
+ --resume_training     Whether to resume training from the checkpoint
+ --seed SEED           Random seed
+ --augment             Perform data augmentation during training
+ --benchmark           Collect performance metrics during training
+ --amp                 Train using TF-AMP
+ --xla                 Train using XLA
+```
+ 
+The 3D-UNet model was trained on the [Brain Tumor Segmentation 2019 dataset](https://www.med.upenn.edu/cbica/brats-2019/). Test images provided by the organization were used to produce the resulting masks for submission. Upon registration, the challenge's data is made available through the https://ipp.cbica.upenn.edu service.
+ 
+The dataset consists of 335 240x240x155 `nifti` volumes. Each volume is represented by 4 modalities and a corresponding segmentation mask. 
+The modalities are:
+* Native T1-weighted (T1),
+* Post-contrast T1-weighted (T1Gd),
+* Native T2-weighted (T2),
+* T2 Fluid Attenuated Inversion Recovery (FLAIR).
+
+Each voxel in a segmentation mask belongs to one of four classes:
+* 0 corresponds to healthy tissue or background,
+* 1 indicates the presence of the necrotic and non-enhancing tumor core (TC),
+* 2 indicates the presence of the peritumoral edema (ED),
+* 4 indicates the presence of the GD-enhancing tumor (ET).
+ 
+The objective is to produce a set of masks that segment the data as accurately as possible. The results are expected to be submitted as a 12-bit `nifti` 3D image, with values corresponding to the underlying class.
+ 
+#### Dataset guidelines
+ 
+The training and test datasets are given as 3D `nifti` volumes that can be read using the Nibabel library and NumPy (both packages are installed by the `Dockerfile`).
+ 
+Initially, all modalities are loaded, stacked and converted into 240x240x155x4 NumPy arrays using Nibabel. To decrease the size of the dataset, each volume is clipped to 85% of the maximal value, normalized to 255 for each modality separately, casted to 8-bit, grouped by 4 volumes, and saved as a `tfrecord` file. The process of converting from `nifti` to `tfrecord` can be found in the `preprocess_data.py` script.
+ 
+The `tfrecord` files are fed to the model through `tf.data.TFRecordDataset()` to achieve high performance.
+ 
+The foreground voxel intensities are then z-score normalized, whereas labels are one-hot encoded for their later use in dice or pixel-wise cross-entropy loss, becoming 240x240x155x4 tensors.
+ 
+If augmentation is enabled, the following set of augmentation techniques are applied:
+* Random horizontal flipping
+* Random 128x128x128x4 crop
+* Random brightness shifting
+ 
+In addition, random vertical flip and random gamma correction augmentations were implemented, but are not used. The process of loading, normalizing and augmenting the data contained in the dataset can be found in the `data_loader.py` script.
+ 
+#### Multi-dataset
+ 
+This implementation is tuned for the Brain Tumor Segmentation 2019 dataset. Using other datasets is possible, but might require changes to the code (data loader) and tuning some hyperparameters (e.g. learning rate, number of iterations).
+ 
+In the current implementation, the data loader works with tfrecord files. It should work seamlessly with any dataset containing 3D data stored in tfrecord format, as long as features (with corresponding mean and standard deviation) and labels are stored as bytestream in the same file as `X`, `Y`, `mean`, and `stdev`.  See the data pre-processing script for details. If your data is stored in a different format, you will have to modify the parsing function in the `dataset/data_loader.py` file. For a walk-through, check the [TensorFlow tf.data API guide](https://www.tensorflow.org/guide/data_performance)
+ 
+## Training process
+ 
+The model trains for a total of 16,000 (16,000 / number of GPUs) iterations for each fold, with the default 3D-UNet setup:
+* Adam optimizer with learning rate of 0.0002.
+* Training and evaluation batch size of 2.
+ 
+This default parametrization is applied when running scripts from the `./examples` directory and when running `main.py` without explicitly overriding these parameters. By default, the training is in full precision. To enable AMP, pass the `--amp` flag. AMP can be enabled for every mode of execution.
+ 
+The default configuration minimizes a function _L = 1 - DICE + cross entropy_ during training and reports achieved convergence as dice score per class, mean dice score, and dice score for whole tumor vs background. The training with a combination of dice and cross entropy has been proven to achieve better convergence than a training using only dice.
+ 
+The training can be run directly without using the predefined scripts. The name of the training script is `main.py`. Because of the multi-GPU support, training should always be run with the Horovod distributed launcher like this:
+```bash
+horovodrun -np <number/of/gpus> python main.py --data_dir /data/preprocessed --exec_mode train [other parameters]
+```
+ 
+*Note:* When calling the `main.py` script manually, data augmentation is disabled. In order to enable data augmentation, use the `--augment` flag in your invocation.
+ 
+The main results of the training are checkpoints stored by default in the `./results/` directory on the host machine, and in the `/results` directory in the container. This location can be controlled
+by the `--model_dir` command-line argument, if a different location was mounted while starting the container. In the case when the training is run in `train_and_predict` mode, the inference will take place after the training is finished, and inference results will be stored to the `/results` directory.
+ 
+If the `--exec_mode train_and_evaluate` parameter was used, and if `--fold` parameter is set to an integer value of {0, 1, 2, 3, 4}, the evaluation of the validation set takes place after the training is completed. The results of the evaluation will be printed to the console.
+
+### Inference process
+ 
+Inference can be launched with the same script used for training by passing the `--exec_mode predict` flag:
+```bash
+python main.py --exec_mode predict --data_dir /data/preprocessed --model_dir <path/to/checkpoint> [other parameters]
+```
+ 
+The script will then:
+* Load the checkpoint from the directory specified by the `<path/to/checkpoint>` directory (`/results`),
+* Run inference on the test dataset,
+* Save the resulting masks in the `numpy` format in the `--model_dir` directory.
+ 
+## Performance
+ 
+### Benchmarking
+ 
+The following section shows how to run benchmarks measuring the model performance in training and inference modes.
+ 
+#### Training performance benchmark
+ 
+To benchmark training, run one of the `train_benchmark` scripts in `./examples/`:
+```bash
+bash examples/unet3d_train_benchmark{_TF-AMP}.sh <num/of/gpus> <path/to/dataset> <path/to/checkpoints> <batch/size>
+```
+For example, to benchmark training using mixed-precision on 4 GPUs with batch size of 2 use:
+```bash
+bash examples/unet3d_train_benchmark_TF-AMP.sh 4 <path/to/dataset> <path/to/checkpoints> 2
+```
+ 
+Each of these scripts will by default run 40 warm-up iterations and benchmark the performance during training in the next 40 iterations.
+ 
+To have more control, you can run the script by directly providing all relevant run parameters. For example:
+```bash
+horovodrun -np <num/of/gpus> python main.py --exec_mode train --benchmark --augment --data_dir <path/to/dataset> --model_dir <path/to/checkpoints> --batch_size <batch/size> --warmup_steps <warm-up/steps> --max_steps <max/steps>
+```
+ 
+At the end of the script, a line reporting the best train throughput will be printed.
+ 
+#### Inference performance benchmark
+ 
+To benchmark inference, run one of the scripts in `./examples/`:
+```bash
+bash examples/unet3d_infer_benchmark{_TF-AMP}.sh <path/to/dataset> <path/to/checkpoints> <batch/size>
+```
+ 
+For example, to benchmark inference using mixed-precision with batch size 4:
+```bash
+bash examples/unet3d_infer_benchmark_TF-AMP.sh <path/to/dataset> <path/to/checkpoints> 4
+```
+ 
+Each of these scripts will by default run 20 warm-up iterations and benchmark the performance during inference in the next 20 iterations.
+ 
+To have more control, you can run the script by directly providing all relevant run parameters. For example:
+```bash
+python main.py --exec_mode predict --benchmark --data_dir <path/to/dataset> --model_dir <optional, path/to/checkpoint> --batch_size <batch/size> --warmup_steps <warm-up/steps> --max_steps <max/steps>
+```
+ 
+At the end of the script, a line reporting the best inference throughput will be printed.
+
+### Results
+ 
+The following sections provide details on how we achieved our performance and accuracy of training and inference.
+ 
+#### Training accuracy results
+
+##### Training accuracy: NVIDIA DGX-1 (8x V100 32GB)
+ 
+The following table lists the average DICE score across 5-fold cross-validation. Our results were obtained by running the `examples/unet3d_train_full{_TF-AMP}.sh` training script in the `tensorflow:20.06-tf1-py3` NGC container on NVIDIA DGX-1 (8x V100 32GB) GPUs.
+ 
+| GPUs | Batch size / GPU | DICE - FP32 | DICE - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) |
+|---|---|--------|--------|--------|--------|------|
+| 8 | 2 | 0.8818 | 0.8819 | 41 min | 23 min | 1.78 |
+ 
+To reproduce this result, start the Docker container interactively and run one of the train scripts:
+```bash
+bash examples/unet3d_train_full{_TF-AMP}.sh <num/of/gpus> <path/to/dataset> <path/to/checkpoint> <batch/size>
+```
+ for example to train using 8 GPUs and batch size of 2:
+```bash
+bash examples/unet3d_train_full_TF-AMP.sh 8 /data/preprocessed /results 2
+```
+
+This command will launch a script which will run 5-fold cross-validation training for 16,000 iterations on each fold and print:
+ * the validation DICE scores for each class: Tumor Core (TC), Peritumoral Edema (ED), Enhancing Tumor (ET),
+ * the mean DICE score,
+ * the whole tumor (WT) which represents a binary classification case (tumor vs background).
+ 
+The time reported is for one fold, which means that the training of 5 folds will take 5 times longer. The default batch size is 2, however if you have less than 16 GB memory card and you encounter GPU memory issues you should decrease the batch size. The logs of the runs can be found in the `/results` directory once the script is finished.
+
+#### Training performance results
+
+##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
+ 
+Our results were obtained by running the `examples/unet3d_train_benchmark{_TF-AMP}.sh` training script in the `tensorflow:20.06-tf1-py3` NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs. Performance numbers (in volumes per second) were averaged over 80 iterations, excluding the first 40 warm-up steps.
+ 
+| GPUs | Batch size / GPU | Throughput - FP32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |       
+|---|---|--------|--------|-------|-------|-------|
+| 1 | 2 | 1.987  | 4.381  | 2.205 | N/A   | N/A   |
+| 8 | 2 | 14.843 | 28.948 | 1.950 | 7.471 | 6.608 |
+
+##### Training performance: NVIDIA DGX-1 (8x V100 32GB)
+ 
+Our results were obtained by running the `examples/unet3d_train_benchmark{_TF-AMP}.sh` training script in the `tensorflow:20.06-tf1-py3` NGC container on NVIDIA DGX-1 with (8x V100 32GB) GPUs. Performance numbers (in volumes per second) were averaged over 80 iterations, excluding the first 40 warm-up steps.
+ 
+| GPUs | Batch size / GPU | Throughput - FP32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |       
+|---|---|--------|--------|-------|-------|-------|
+| 1 | 2 | 2.002  | 4.360  | 2.177 | N/A   | N/A   |
+| 1 | 4 | 2.160  | 4.407  | 2.041 | N/A   | N/A   |
+| 8 | 2 | 14.781 | 26.694 | 1.806 | 7.381 | 6.123 |
+| 8 | 4 | 16.013 | 28.423 | 1.775 | 7.414 | 6.449 |
+
+ 
+To achieve these same results, follow the steps in the [Training performance benchmark](#training-performance-benchmark) section.
+ 
+#### Inference performance results
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
+ 
+Our results were obtained by running the `examples/unet3d_infer_benchmark{_TF-AMP}.sh` inferencing benchmarking script in the `tensorflow:20.06-tf1-py3` NGC container on NVIDIA DGX-1 with (1x V100 16GB) GPU. Performance numbers (in volumes per second) were averaged over 40 iterations, excluding the first 20 warm-up steps.
+ 
+FP16
+ 
+| Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|---|---------------|-------|----------|----------|----------|----------|
+| 1 | 224x224x160x4 | 2.546 | 392.803  | 393.031  | 393.075  | 393.160  |
+| 2 | 224x224x160x4 | 2.923 | 684.363  | 684.806  | 684.891  | 685.056  |
+| 4 | 224x224x160x4 | 3.408 | 1173.739 | 1174.369 | 1174.489 | 1174.725 |
+ 
+FP32
+ 
+| Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|---|---------------|-------|----------|----------|----------|----------|
+| 1 | 224x224x160x4 | 1.527 | 654.911  | 655.180  | 655.232  | 655.333  |
+| 2 | 224x224x160x4 | 1.554 | 1287.376 | 1287.997 | 1288.116 | 1288.348 |
+| 4 | 224x224x160x4 | OOM   |          |          |          |          |
+ 
+ 
+##### Inference performance: NVIDIA DGX-1 (1x V100 32GB)
+ 
+Our results were obtained by running the `examples/unet3d_infer_benchmark{_TF-AMP}.sh` inferencing benchmarking script in the `tensorflow:20.06-tf1-py3` NGC container on NVIDIA DGX-1 with (1x V100 32GB) GPU. Performance numbers (in volumes per second) were averaged over 40 iterations, excluding the first 20 warm-up steps.
+
+ 
+FP16
+ 
+| Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|---|---------------|-------|----------|----------|----------|----------|
+| 1 | 224x224x160x4 | 2.576 | 388.276  | 388.400  | 388.423  | 388.470  |
+| 2 | 224x224x160x4 | 2.861 | 699.078  | 699.567  | 699.660  | 699.843  |
+| 4 | 224x224x160x4 | 3.333 | 1200.198 | 1200.631 | 1200.714 | 1200.877 |
+ 
+FP32
+ 
+| Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|---|---------------|-------|----------|----------|----------|----------|
+| 1 | 224x224x160x4 | 1.990 | 502.485  | 502.550  | 502.563  | 502.587  |
+| 2 | 224x224x160x4 | 2.013 | 993.650  | 993.982  | 994.046  | 994.170  |
+| 4 | 224x224x160x4 | 2.435 | 1642.637 | 1643.058 | 1643.139 | 1643.297 |
+ 
+To achieve these same results, follow the steps in the [Inference performance benchmark](#inference-performance-benchmark) section.
+ 
+
+ 
+## Release notes
+ 
+### Changelog
+ 
+June 2020
+* Initial release
+ 
+ 
+### Known issues
+ 
+There are no known issues in this release.
+
+ 
+
+

+ 254 - 0
TensorFlow/Segmentation/UNet_3D_Medical/dataset/data_loader.py

@@ -0,0 +1,254 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import horovod.tensorflow as hvd
+import numpy as np
+import tensorflow as tf
+
+from dataset.transforms import NormalizeImages, OneHotLabels, apply_transforms, PadXYZ, RandomCrop3D, \
+    RandomHorizontalFlip, RandomGammaCorrection, RandomVerticalFlip, RandomBrightnessCorrection, CenterCrop, \
+    apply_test_transforms, Cast
+
+CLASSES = {0: "TumorCore", 1: "PeritumoralEdema", 2: "EnhancingTumor"}
+
+
+def cross_validation(x: np.ndarray, fold_idx: int, n_folds: int):
+    if fold_idx < 0 or fold_idx >= n_folds:
+        raise ValueError('Fold index has to be [0, n_folds). Received index {} for {} folds'.format(fold_idx, n_folds))
+
+    _folders = np.array_split(x, n_folds)
+
+    return np.concatenate(_folders[:fold_idx] + _folders[fold_idx + 1:]), _folders[fold_idx]
+
+
+class Dataset:
+    def __init__(self, data_dir, batch_size=2, fold_idx=0, n_folds=5, seed=0, pipeline_factor=1, params=None):
+        self._folders = np.array([os.path.join(data_dir, path) for path in os.listdir(data_dir)])
+        self._train, self._eval = cross_validation(self._folders, fold_idx=fold_idx, n_folds=n_folds)
+        self._pipeline_factor = pipeline_factor
+        self._data_dir = data_dir
+        self.params = params
+
+        self._batch_size = batch_size
+        self._seed = seed
+
+        self._xshape = (240, 240, 155, 4)
+        self._yshape = (240, 240, 155)
+
+    def parse(self, serialized):
+        features = {
+            'X': tf.io.FixedLenFeature([], tf.string),
+            'Y': tf.io.FixedLenFeature([], tf.string),
+            'mean': tf.io.FixedLenFeature([4], tf.float32),
+            'stdev': tf.io.FixedLenFeature([4], tf.float32)
+        }
+
+        parsed_example = tf.io.parse_single_example(serialized=serialized,
+                                                    features=features)
+
+        x = tf.io.decode_raw(parsed_example['X'], tf.uint8)
+        x = tf.cast(tf.reshape(x, self._xshape), tf.uint8)
+        y = tf.io.decode_raw(parsed_example['Y'], tf.uint8)
+        y = tf.cast(tf.reshape(y, self._yshape), tf.uint8)
+
+        mean = parsed_example['mean']
+        stdev = parsed_example['stdev']
+
+        return x, y, mean, stdev
+
+    def parse_x(self, serialized):
+        features = {'X': tf.io.FixedLenFeature([], tf.string),
+                    'Y': tf.io.FixedLenFeature([], tf.string),
+                    'mean': tf.io.FixedLenFeature([4], tf.float32),
+                    'stdev': tf.io.FixedLenFeature([4], tf.float32)}
+
+        parsed_example = tf.io.parse_single_example(serialized=serialized,
+                                                    features=features)
+
+        x = tf.io.decode_raw(parsed_example['X'], tf.uint8)
+        x = tf.cast(tf.reshape(x, self._xshape), tf.uint8)
+
+        mean = parsed_example['mean']
+        stdev = parsed_example['stdev']
+
+        return x, mean, stdev
+
+    def train_fn(self):
+        assert len(self._train) > 0, "Training data not found."
+
+        ds = tf.data.TFRecordDataset(filenames=self._train)
+
+        ds = ds.shard(hvd.size(), hvd.rank())
+        ds = ds.cache()
+        ds = ds.shuffle(buffer_size=self._batch_size * 8, seed=self._seed)
+        ds = ds.repeat()
+
+        ds = ds.map(self.parse, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+        transforms = [
+            RandomCrop3D((128, 128, 128)),
+            RandomHorizontalFlip() if self.params.augment else None,
+            Cast(dtype=tf.float32),
+            NormalizeImages(),
+            RandomBrightnessCorrection() if self.params.augment else None,
+            OneHotLabels(n_classes=4),
+        ]
+
+        ds = ds.map(map_func=lambda x, y, mean, stdev: apply_transforms(x, y, mean, stdev, transforms=transforms),
+                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+        ds = ds.batch(batch_size=self._batch_size,
+                      drop_remainder=True)
+
+        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+        return ds
+
+    def eval_fn(self):
+        ds = tf.data.TFRecordDataset(filenames=self._eval)
+        assert len(self._eval) > 0, "Evaluation data not found. Did you specify --fold flag?"
+
+        ds = ds.cache()
+        ds = ds.map(self.parse, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+        transforms = [
+            CenterCrop((224, 224, 155)),
+            Cast(dtype=tf.float32),
+            NormalizeImages(),
+            OneHotLabels(n_classes=4),
+            PadXYZ()
+        ]
+
+        ds = ds.map(map_func=lambda x, y, mean, stdev: apply_transforms(x, y, mean, stdev, transforms=transforms),
+                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
+        ds = ds.batch(batch_size=self._batch_size,
+                      drop_remainder=False)
+        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+        return ds
+
+    def test_fn(self, count=1, drop_remainder=False):
+        ds = tf.data.TFRecordDataset(filenames=self._eval)
+        assert len(self._eval) > 0, "Evaluation data not found. Did you specify --fold flag?"
+
+        ds = ds.repeat(count)
+        ds = ds.map(self.parse_x, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+        transforms = [
+            CenterCrop((224, 224, 155)),
+            Cast(dtype=tf.float32),
+            NormalizeImages(),
+            PadXYZ((224, 224, 160))
+        ]
+
+        ds = ds.map(map_func=lambda x, mean, stdev: apply_test_transforms(x, mean, stdev, transforms=transforms),
+                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
+        ds = ds.batch(batch_size=self._batch_size,
+                      drop_remainder=drop_remainder)
+        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+        return ds
+
+    def synth_train_fn(self):
+        """Synthetic data function for testing"""
+        inputs = tf.random_uniform(self._xshape, dtype=tf.int32, minval=0, maxval=255, seed=self._seed,
+                                   name='synth_inputs')
+        masks = tf.random_uniform(self._yshape, dtype=tf.int32, minval=0, maxval=4, seed=self._seed,
+                                  name='synth_masks')
+
+        ds = tf.data.Dataset.from_tensors((inputs, masks))
+        ds = ds.repeat()
+
+        transforms = [
+            Cast(dtype=tf.uint8),
+            RandomCrop3D((128, 128, 128)),
+            RandomHorizontalFlip() if self.params.augment else None,
+            Cast(dtype=tf.float32),
+            NormalizeImages(),
+            RandomBrightnessCorrection() if self.params.augment else None,
+            OneHotLabels(n_classes=4),
+        ]
+
+        ds = ds.map(map_func=lambda x, y: apply_transforms(x, y, transforms),
+                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
+        ds = ds.batch(self._batch_size)
+        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+        return ds
+
+    def synth_predict_fn(self, count=1):
+        """Synthetic data function for testing"""
+        inputs = tf.truncated_normal((64, 64, 64, 4), dtype=tf.float32, mean=0.0, stddev=1.0, seed=self._seed,
+                                     name='synth_inputs')
+
+        ds = tf.data.Dataset.from_tensors(inputs)
+        ds = ds.repeat(count)
+        ds = ds.batch(self._batch_size)
+        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+        return ds
+
+    @property
+    def train_size(self):
+        return len(self._train)
+
+    @property
+    def eval_size(self):
+        return len(self._eval)
+
+    @property
+    def test_size(self):
+        return len(self._eval)
+
+
+def main():
+    from time import time
+    hvd.init()
+
+    dataset = Dataset(data_dir='/data/BraTS19_tfrecord', batch_size=3)
+
+    it = dataset.test().make_initializable_iterator()
+
+    sess = tf.Session()
+    sess.run(it.initializer)
+
+    next_element = it.get_next()
+
+    t0 = time()
+    cnt = 0
+    # while True:
+    import matplotlib.pyplot as plt
+    import numpy.ma as ma
+    for i in range(200):
+        t0 = time()
+        # if i == 20:
+        #     t0 = time()
+
+        res = sess.run(next_element)
+        a = res[0]
+        a = a[0, :, :, 80, 0]
+        a = ma.masked_array(a, mask=a == 0)
+        # plt.imshow(a.astype(np.uint8))
+        plt.imshow(a)
+        plt.colorbar()
+        plt.savefig("/opt/project/img.png")
+
+        # print()
+        print(time() - t0)
+
+
+if __name__ == '__main__':
+    main()

+ 161 - 0
TensorFlow/Segmentation/UNet_3D_Medical/dataset/preprocess_data.py

@@ -0,0 +1,161 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+from random import shuffle
+import numpy as np
+
+import nibabel as nib
+import tensorflow as tf
+
+
# Command-line interface for converting raw BraTS NIfTI volumes to tfrecords.
PARSER = argparse.ArgumentParser(description="Convert BraTS NIfTI volumes to tfrecord files.")

PARSER.add_argument('--input_dir', '-i',
                    type=str, help='path to the input directory with data')

PARSER.add_argument('--output_dir', '-o',
                    type=str, help='path to the output directory where tfrecord files will be stored')

PARSER.add_argument('--verbose', '-v', dest='verbose', action='store_true', default=False,
                    help='print progress as tfrecord files are written')

PARSER.add_argument('--vol_per_file', default=4, dest='vol_per_file',
                    type=int, help='how many volumes to pack into a single tfrecord file')

PARSER.add_argument('--single_data_dir', dest='single_data_dir', action='store_true', default=False,
                    help='treat input_dir as one flat folder of patient dirs instead of HGG/LGG subfolders')
+
+
def load_features(path):
    """Load the four MRI modalities for one patient and rescale each to uint8.

    Each modality is clipped at 85% of its maximum intensity, then scaled
    to [0, 255] before casting. Returns a (240, 240, 155, 4) uint8 array.
    """
    modalities = ("_t1.nii.gz", "_t1ce.nii.gz", "_t2.nii.gz", "_flair.nii.gz")
    patient_id = os.path.basename(path)
    stacked = np.zeros((240, 240, 155, 4), dtype=np.uint8)

    for channel, suffix in enumerate(modalities):
        volume = load_single_nifti(os.path.join(path, patient_id + suffix)).astype(np.float32)
        ceiling = 0.85 * volume.max()
        volume[volume > ceiling] = ceiling  # clip intensity outliers
        stacked[..., channel] = (255 * volume / volume.max()).astype(np.uint8)

    return stacked
+
+
def load_segmentation(path):
    """Load the ground-truth segmentation volume for one patient as uint8."""
    seg_file = os.path.basename(path) + "_seg.nii.gz"
    return load_single_nifti(os.path.join(path, seg_file)).astype(np.uint8)
+
+
def load_single_nifti(path):
    """Read one NIfTI file as int16 with the first two axes swapped."""
    data = nib.load(path).get_fdata().astype(np.int16)
    # (x, y, z) -> (y, x, z); downstream code expects this orientation.
    return np.transpose(data, (1, 0, 2))
+
+
def write_to_file(features_list, labels_list, foreground_mean_list, foreground_std_list, output_dir, count):
    """Serialize one shard of buffered volumes into `volume-<count>.tfrecord`."""
    shard_path = os.path.join(output_dir, "volume-{}.tfrecord".format(count))
    examples = list(zip(np.array(features_list),
                        np.array(labels_list),
                        np.array(foreground_mean_list),
                        np.array(foreground_std_list)))
    np_to_tfrecords(examples, shard_path)
+
+
def np_to_tfrecords(filelist, output_filename):
    """Write (features, labels, mean, stdev) tuples as tf.train.Examples.

    Parameters
    ----------
    filelist : list of tuples
        Each tuple holds (X volume, Y volume, per-channel mean, per-channel stdev)
        as numpy arrays.
    output_filename : str
        Destination tfrecord path.
    """
    # Context manager guarantees the writer is flushed/closed even on error.
    with tf.io.TFRecordWriter(output_filename) as writer:
        for x_vol, y_vol, mean, stdev in filelist:
            # tobytes() replaces ndarray.tostring(), which is deprecated and
            # removed in NumPy 2.0.
            x_bytes = x_vol.flatten().tobytes()
            y_bytes = y_vol.flatten().tobytes()
            mean = mean.astype(np.float32).flatten()
            stdev = stdev.astype(np.float32).flatten()

            feature = {
                'X': tf.train.Feature(bytes_list=tf.train.BytesList(value=[x_bytes])),
                'Y': tf.train.Feature(bytes_list=tf.train.BytesList(value=[y_bytes])),
                'mean': tf.train.Feature(float_list=tf.train.FloatList(value=mean)),
                'stdev': tf.train.Feature(float_list=tf.train.FloatList(value=stdev)),
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
+
+
def main():
    """Convert the BraTS dataset into shuffled multi-volume tfrecord shards."""
    # parse arguments
    params = PARSER.parse_args()
    input_dir = params.input_dir
    output_dir = params.output_dir
    os.makedirs(params.output_dir, exist_ok=True)

    # Collect one directory per patient, either from a flat layout or from
    # the standard HGG/LGG split.
    patient_list = []
    if params.single_data_dir:
        patient_list.extend([os.path.join(input_dir, folder) for folder in os.listdir(input_dir)])
    else:
        assert "HGG" in os.listdir(input_dir) and "LGG" in os.listdir(input_dir),\
            "Data directory has to contain folders named HGG and LGG. " \
            "If you have a single folder with patient's data please set --single_data_dir flag"
        path_hgg = os.path.join(input_dir, "HGG")
        path_lgg = os.path.join(input_dir, "LGG")
        patient_list.extend([os.path.join(path_hgg, folder) for folder in os.listdir(path_hgg)])
        patient_list.extend([os.path.join(path_lgg, folder) for folder in os.listdir(path_lgg)])
    # Shuffle once so each shard mixes HGG and LGG cases.
    shuffle(patient_list)

    # Buffers for the volumes that go into the current shard.
    features_list = []
    labels_list = []
    foreground_mean_list = []
    foreground_std_list = []
    count = 0

    # Round up: a final partial shard is written for any leftover volumes.
    total_tfrecord_files = len(patient_list) // params.vol_per_file + (1 if len(patient_list) % params.vol_per_file
                                                                       else 0)
    for i, folder in enumerate(patient_list):

        # Calculate mean and stdev only for foreground voxels
        features = load_features(folder)
        foreground = features > 0
        fg_mean = np.array([(features[..., i][foreground[..., i]]).mean() for i in range(features.shape[-1])])
        fg_std = np.array([(features[..., i][foreground[..., i]]).std() for i in range(features.shape[-1])])

        # BraTS labels are 0,1,2,4 -> switching to 0,1,2,3
        labels = load_segmentation(folder)
        labels[labels == 4] = 3

        features_list.append(features)
        labels_list.append(labels)
        foreground_mean_list.append(fg_mean)
        foreground_std_list.append(fg_std)

        # Flush a shard once vol_per_file volumes have been buffered.
        if (i+1) % params.vol_per_file == 0:
            write_to_file(features_list, labels_list, foreground_mean_list, foreground_std_list, output_dir, count)

            # Clear lists
            features_list = []
            labels_list = []
            foreground_mean_list = []
            foreground_std_list = []
            count += 1

            if params.verbose:
                print("{}/{} tfrecord files created".format(count, total_tfrecord_files))

    # create one more file if there are any remaining unpacked volumes
    if features_list:
        write_to_file(features_list, labels_list, foreground_mean_list, foreground_std_list, output_dir, count)
        count += 1
        if params.verbose:
            print("{}/{} tfrecord files created".format(count, total_tfrecord_files))
+
+
+if __name__ == '__main__':
+    main()
+

+ 208 - 0
TensorFlow/Segmentation/UNet_3D_Medical/dataset/transforms.py

@@ -0,0 +1,208 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
def apply_transforms(x, y, mean, stdev, transforms):
    """Run (x, y) through every non-None transform in order.

    Each transform is a callable taking (x, y, mean, stdev) and returning the
    updated (x, y) pair; `None` entries in `transforms` are skipped.
    """
    for transform in transforms:
        if transform is None:
            continue
        x, y = transform(x, y, mean, stdev)
    return x, y
+
+
def apply_test_transforms(x, mean, stdev, transforms):
    """Run an unlabeled sample through every non-None transform in order.

    Transforms receive y=None and are expected to return only the
    transformed x in that case.
    """
    for transform in transforms:
        if transform is None:
            continue
        x = transform(x, y=None, mean=mean, stdev=stdev)
    return x
+
+
class PadXYZ:
    """Zero-pad the third spatial (depth) axis by 5 voxels at the end.

    NOTE(review): `shape` is stored but never used, and the padding amount is
    hard-coded below — presumably to round 155 slices up to 160; confirm.
    """
    def __init__(self, shape=None):
        self.shape = shape

    def __call__(self, x, y, mean, stdev):
        # Pad only axis 2 of an (H, W, D, C) tensor.
        paddings = tf.constant([[0, 0], [0, 0], [0, 5], [0, 0]])
        x = tf.pad(x, paddings, "CONSTANT")
        if y is None:  # inference path: no labels to pad
            return x
        y = tf.pad(y, paddings, "CONSTANT")
        return x, y
+
+
class CenterCrop:
    """Crop the central `shape`-sized region from the spatial axes."""

    def __init__(self, shape):
        self.shape = shape

    def __call__(self, x, y, mean, stdev):
        full_shape = x.get_shape()
        # Per-axis offset that centers the crop window.
        offsets = [(full_shape[i].value - self.shape[i]) // 2 for i in range(len(self.shape))]
        x = x[offsets[0]:offsets[0] + self.shape[0],
              offsets[1]:offsets[1] + self.shape[1],
              offsets[2]:offsets[2] + self.shape[2]]
        if y is None:
            return x
        y = y[offsets[0]:offsets[0] + self.shape[0],
              offsets[1]:offsets[1] + self.shape[1],
              offsets[2]:offsets[2] + self.shape[2]]
        return x, y
+
+
class RandomCrop3D:
    """Crop a random `shape`-sized 3D window, keeping `margins` off each border."""

    def __init__(self, shape, margins=(0, 0, 0)):
        self.shape = shape
        self.margins = margins

    def __call__(self, x, y, mean, stdev):
        input_shape = x.get_shape()
        # Valid range for the crop origin. `minval`/`maxval` replace locals
        # that previously shadowed the built-ins `min` and `max`.
        minval = tf.constant(self.margins, dtype=tf.float32)
        maxval = tf.constant([input_shape[0].value - self.shape[0] - self.margins[0],
                              input_shape[1].value - self.shape[1] - self.margins[1],
                              input_shape[2].value - self.shape[2] - self.margins[2]], dtype=tf.float32)
        origin = tf.random_uniform((len(self.shape),), minval=minval, maxval=maxval)
        origin = tf.cast(origin, dtype=tf.int32)
        x = x[origin[0]:origin[0] + self.shape[0],
              origin[1]:origin[1] + self.shape[1],
              origin[2]:origin[2] + self.shape[2]]
        if y is None:
            return x
        y = y[origin[0]:origin[0] + self.shape[0],
              origin[1]:origin[1] + self.shape[1],
              origin[2]:origin[2] + self.shape[2]]
        return x, y
+
+
class NormalizeImages:
    """Standardize foreground voxels using the provided mean/stdev.

    Background voxels (value 0) are left untouched.
    """

    def __init__(self):
        pass

    def __call__(self, x, y, mean, stdev):
        foreground = tf.math.greater(x, 0)
        normalized = (x - tf.cast(mean, x.dtype)) / tf.cast(stdev + 1e-8, x.dtype)
        x = tf.where(foreground, normalized, x)

        return x if y is None else (x, y)
+
+
class Cast:
    """Cast the image tensor to the requested dtype; labels pass through unchanged."""

    def __init__(self, dtype=tf.float32):
        self._dtype = dtype

    def __call__(self, x, y, mean, stdev):
        casted = tf.cast(x, dtype=self._dtype)
        return casted if y is None else (casted, y)
+
+
class RandomHorizontalFlip:
    """Flip image and label along axis 1 with probability 1 - threshold."""

    def __init__(self, threshold=0.5):
        self._threshold = threshold

    def __call__(self, x, y, mean, stdev):
        # A single coin toss shared by image and label keeps them aligned.
        h_flip = tf.random_uniform([]) > self._threshold

        x = tf.cond(h_flip, lambda: tf.reverse(x, axis=[1]), lambda: x)
        y = tf.cond(h_flip, lambda: tf.reverse(y, axis=[1]), lambda: y)

        return x, y
+
+
class RandomVerticalFlip:
    """Flip image and label along axis 0 with probability 1 - threshold."""

    def __init__(self, threshold=0.5):
        self._threshold = threshold

    def __call__(self, x, y, mean, stdev):
        # Renamed from `h_flip` (copy-paste from the horizontal variant): this
        # class flips axis 0. One coin toss keeps image and label aligned.
        v_flip = tf.random_uniform([]) > self._threshold

        x = tf.cond(v_flip, lambda: tf.reverse(x, axis=[0]), lambda: x)
        y = tf.cond(v_flip, lambda: tf.reverse(y, axis=[0]), lambda: y)

        return x, y
+
+
class RandomGammaCorrection:
    """Randomly gamma-correct the image with probability 1 - threshold.

    The image is rescaled to [0, 1], raised to a random gamma drawn from
    `gamma_range`, then mapped back to its original intensity range.
    """
    def __init__(self, gamma_range=(0.8, 1.5), keep_stats=False, threshold=0.5, epsilon=1e-8):
        # keep_stats is stored but not used anywhere in this class.
        self._gamma_range = gamma_range
        self._keep_stats = keep_stats
        self._eps = epsilon
        self._threshold = threshold

    def __call__(self, x, y, mean, stdev):
        augment = tf.random_uniform([]) > self._threshold
        gamma = tf.random_uniform([], minval=self._gamma_range[0], maxval=self._gamma_range[1])

        x_min = tf.math.reduce_min(x)
        x_range = tf.math.reduce_max(x) - x_min

        # NOTE(review): float() is applied to a symbolic tensor here; in graph
        # mode this may not act as a plain cast — confirm it is intended.
        x = tf.cond(augment,
                    lambda: tf.math.pow(((x - x_min) / float(x_range + self._eps)), gamma) * x_range + x_min,
                    lambda: x)
        return x, y
+
+
class RandomBrightnessCorrection:
    """Randomly shift foreground intensities with probability 1 - threshold.

    A correction factor within [1 - alpha, 1 + alpha] is drawn (per channel
    when `per_channel`) and ADDED to every foreground voxel.
    NOTE(review): the correction is additive (`x + correction`) although the
    factor is centered at 1.0, which reads like a multiplicative gain —
    confirm this matches the intended augmentation.
    """
    def __init__(self, alpha=0.1, threshold=0.5, per_channel=True):
        self._alpha_range = [1.0 - alpha, 1.0 + alpha]
        self._threshold = threshold
        self._per_channel = per_channel

    def __call__(self, x, y, mean, stdev):
        # Only foreground (non-zero) voxels receive the correction.
        mask = tf.math.greater(x, 0)
        size = x.get_shape()[-1].value if self._per_channel else 1
        augment = tf.random_uniform([]) > self._threshold
        correction = tf.random_uniform([size],
                                       minval=self._alpha_range[0],
                                       maxval=self._alpha_range[1],
                                       dtype=x.dtype)

        x = tf.cond(augment,
                    lambda: tf.where(mask, x + correction, x),
                    lambda: x)

        return x, y
+
+
class OneHotLabels:
    """Expand integer class labels into a one-hot encoded last axis."""

    def __init__(self, n_classes=1):
        self._n_classes = n_classes

    def __call__(self, x, y, mean, stdev):
        one_hot = tf.one_hot(y, self._n_classes)
        return x, one_hot
+
+
class PadXY:
    """Zero-pad the leading axes of x and y up to `dst_size` on each axis."""

    def __init__(self, dst_size=None):
        if not dst_size:
            raise ValueError("Invalid padding size: {}".format(dst_size))

        self._dst_size = dst_size

    def __call__(self, x, y, mean, stdev):
        padded_x = tf.pad(x, self._build_padding(x))
        padded_y = tf.pad(y, self._build_padding(y))
        return padded_x, padded_y

    def _build_padding(self, tensor):
        # Axes covered by dst_size grow at the end; the rest stay untouched.
        return [(0, self._dst_size[i] - tensor.shape[i]) if i < len(self._dst_size) else (0, 0)
                for i in range(len(tensor.shape))]

+ 19 - 0
TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_infer_benchmark.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches 3D-UNet run FP32 inference benchmark.
+# Usage:
+# bash examples/unet3d_infer_benchmark.sh <path/to/dataset> <path/to/results/directory> <batch/size>
+
# Positional parameters quoted so paths containing spaces survive word splitting.
python main.py --data_dir "$1" --model_dir "$2" --exec_mode predict --warmup_steps 20 --fold 0 --batch_size "$3" --benchmark --xla

+ 19 - 0
TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_infer_benchmark_TF-AMP.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches 3D-UNet run TF-AMP inference benchmark.
+# Usage:
+# bash examples/unet3d_infer_benchmark_TF-AMP.sh <path/to/dataset> <path/to/results/directory> <batch/size>
+
# Positional parameters quoted so paths containing spaces survive word splitting.
python main.py --data_dir "$1" --model_dir "$2" --exec_mode predict --warmup_steps 20 --fold 0 --batch_size "$3" --benchmark --amp --xla

+ 19 - 0
TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_benchmark.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches 3D-UNet run FP32 train benchmark.
+# Usage:
+# bash examples/unet3d_train_benchmark.sh <number/of/gpus> <path/to/dataset> <path/to/results/directory> <batch/size>
+
# Positional parameters quoted so paths containing spaces survive word splitting.
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --exec_mode train --max_steps 80 --benchmark --fold 0 --batch_size "$4" --xla --augment

+ 19 - 0
TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_benchmark_TF-AMP.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches 3D-UNet run TF-AMP train benchmark.
+# Usage:
# bash examples/unet3d_train_benchmark_TF-AMP.sh <number/of/gpus> <path/to/dataset> <path/to/results/directory> <batch/size>
+
# Positional parameters quoted so paths containing spaces survive word splitting.
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --exec_mode train --max_steps 80 --benchmark --fold 0 --batch_size "$4" --amp --xla --augment

+ 25 - 0
TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_full.sh

@@ -0,0 +1,25 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches 3D-UNet run 5-fold cross-validation FP32 training for 16000 iterations each.
+# Usage:
+# bash examples/unet3d_train_full.sh <number/of/gpus> <path/to/dataset> <path/to/results/directory> <batch/size>
+
# Positional parameters quoted so paths containing spaces survive word splitting.
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 0 --use_xla > "$3/log_FP32_${1}GPU_fold0.txt"
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 1 --use_xla > "$3/log_FP32_${1}GPU_fold1.txt"
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 2 --use_xla > "$3/log_FP32_${1}GPU_fold2.txt"
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 3 --use_xla > "$3/log_FP32_${1}GPU_fold3.txt"
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 4 --use_xla > "$3/log_FP32_${1}GPU_fold4.txt"

python runtime/parse_results.py --model_dir "$3" --env "FP32_${1}GPU"

+ 25 - 0
TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_full_TF-AMP.sh

@@ -0,0 +1,25 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script launches 3D-UNet run 5-fold cross-validation TF-AMP training for 16000 iterations each.
+# Usage:
+# bash examples/unet3d_train_full_TF-AMP.sh <number/of/gpus> <path/to/dataset> <path/to/results/directory> <batch/size>
+
# Positional parameters quoted so paths containing spaces survive word splitting.
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 0 --use_xla --use_amp > "$3/log_TF-AMP_${1}GPU_fold0.txt"
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 1 --use_xla --use_amp > "$3/log_TF-AMP_${1}GPU_fold1.txt"
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 2 --use_xla --use_amp > "$3/log_TF-AMP_${1}GPU_fold2.txt"
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 3 --use_xla --use_amp > "$3/log_TF-AMP_${1}GPU_fold3.txt"
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --log_dir "$3/log.json" --exec_mode train_and_evaluate --max_steps 16000 --augment --batch_size "$4" --fold 4 --use_xla --use_amp > "$3/log_TF-AMP_${1}GPU_fold4.txt"

python runtime/parse_results.py --model_dir "$3" --env "TF-AMP_${1}GPU"

+ 19 - 0
TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_single.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# This script launches 3D-UNet run FP32 training on fold 0 for 16000 iterations.
+# Usage:
+# bash examples/unet3d_train_single.sh <number/of/gpus> <path/to/dataset> <path/to/results/directory> <batch/size>
+
# Positional parameters quoted so paths containing spaces survive word splitting.
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --exec_mode train_and_evaluate --augment --max_steps 16000 --batch_size "$4" --xla --fold 0

+ 19 - 0
TensorFlow/Segmentation/UNet_3D_Medical/examples/unet3d_train_single_TF-AMP.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# This script launches 3D-UNet run TF-AMP training on fold 0 for 16000 iterations.
+# Usage:
+# bash examples/unet3d_train_single_TF-AMP.sh <number/of/gpus> <path/to/dataset> <path/to/results/directory> <batch/size>
+
# Positional parameters quoted so paths containing spaces survive word splitting.
horovodrun -np "$1" python main.py --data_dir "$2" --model_dir "$3" --exec_mode train_and_evaluate --augment --max_steps 16000 --batch_size "$4" --xla --amp --fold 0

BIN
TensorFlow/Segmentation/UNet_3D_Medical/images/U-Net3D_TF1_conv.png


BIN
TensorFlow/Segmentation/UNet_3D_Medical/images/unet3d.png


+ 109 - 0
TensorFlow/Segmentation/UNet_3D_Medical/main.py

@@ -0,0 +1,109 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import logging
+
+import numpy as np
+import tensorflow as tf
+import horovod.tensorflow as hvd
+
+from dataset.data_loader import Dataset, CLASSES
+from runtime.hooks import get_hooks, ProfilingHook, TrainingHook
+from runtime.arguments import PARSER
+from runtime.setup import prepare_model_dir, build_estimator, set_flags, get_logger
+
+
def parse_evaluation_results(result):
    """Collect per-class dice scores plus their mean and the whole-tumor score."""
    class_names = [CLASSES[i] for i in range(len(CLASSES))]
    data = {name: result[name] for name in class_names}
    data['MeanDice'] = sum(data[name] for name in class_names) / len(class_names)
    data['WholeTumor'] = result['WholeTumor']
    return data
+
+
def main():
    """Entry point: dispatch on params.exec_mode (train / evaluate / predict / debug)."""
    tf.get_logger().setLevel(logging.ERROR)
    # Initialize Horovod before building the dataset or estimator.
    hvd.init()
    params = PARSER.parse_args()
    model_dir = prepare_model_dir(params)
    logger = get_logger(params)

    dataset = Dataset(data_dir=params.data_dir,
                      batch_size=params.batch_size,
                      fold_idx=params.fold,
                      n_folds=params.num_folds,
                      params=params)

    estimator = build_estimator(params=params, model_dir=model_dir)

    # Benchmark mode runs the full step count on every worker; otherwise the
    # steps are divided across Horovod workers.
    max_steps = params.max_steps // (1 if params.benchmark else hvd.size())

    # NOTE(review): substring match — 'train' is also contained in
    # 'train_and_evaluate' AND in 'debug_train', so debug_train would run this
    # branch too before its own block below; confirm that is intended.
    if 'train' in params.exec_mode:
        training_hooks = get_hooks(params, logger)
        estimator.train(
            input_fn=dataset.train_fn,
            steps=max_steps,
            hooks=training_hooks)

    # Matches both 'evaluate' and 'train_and_evaluate'.
    if 'evaluate' in params.exec_mode:
        result = estimator.evaluate(input_fn=dataset.eval_fn, steps=dataset.eval_size)
        data = parse_evaluation_results(result)
        if hvd.rank() == 0:
            logger.log(step=(), data=data)

    if 'predict' == params.exec_mode:
        inference_hooks = get_hooks(params, logger)
        # Only rank 0 runs inference; other workers idle.
        if hvd.rank() == 0:
            # In benchmark mode repeat the test set enough times to cover
            # warmup plus measurement steps.
            count = 1 if not params.benchmark else 2 * params.warmup_steps * params.batch_size // dataset.test_size
            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=count,
                                                 drop_remainder=params.benchmark), hooks=inference_hooks)

            for idx, p in enumerate(predictions):
                volume = p['predictions']
                if not params.benchmark:
                    np.save(os.path.join(params.model_dir, "vol_{}.npy".format(idx)), volume)

    # Debug modes run on synthetic data to isolate model performance from I/O.
    if 'debug_train' == params.exec_mode:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        if hvd.rank() == 0:
            hooks += [TrainingHook(log_every=params.log_every,
                                   logger=logger,
                                   tensor_names=['total_loss_ref:0']),
                      ProfilingHook(warmup_steps=params.warmup_steps,
                                    global_batch_size=hvd.size() * params.batch_size,
                                    logger=logger,
                                    mode='train')]

        estimator.train(
            input_fn=dataset.synth_train_fn,
            steps=max_steps,
            hooks=hooks)

    if 'debug_predict' == params.exec_mode:
        if hvd.rank() == 0:
            hooks = [ProfilingHook(warmup_steps=params.warmup_steps,
                                   global_batch_size=params.batch_size,
                                   logger=logger,
                                   mode='inference')]
            count = 2 * params.warmup_steps
            predictions = estimator.predict(input_fn=lambda: dataset.synth_predict_fn(count=count),
                                            hooks=hooks)
            # Drain the generator so inference actually executes.
            for p in predictions:
                _ = p['predictions']
+
+if __name__ == '__main__':
+    set_flags()
+    main()

+ 135 - 0
TensorFlow/Segmentation/UNet_3D_Medical/model/layers.py

@@ -0,0 +1,135 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
def _normalization(inputs, name, mode):
    """Apply the normalization layer selected by `name` to `inputs`.

    Supported: 'instancenorm', 'groupnorm', 'batchnorm' (respects the
    estimator train mode), or 'none' for identity. Raises ValueError otherwise.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN

    if name == 'instancenorm':
        return tf.contrib.layers.instance_norm(
            inputs,
            center=True,
            scale=True,
            epsilon=1e-6,
            param_initializers={'gamma': tf.constant_initializer(1.0)},
            reuse=None,
            variables_collections=None,
            outputs_collections=None,
            trainable=True,
            data_format='NHWC',
            scope=None)
    if name == 'groupnorm':
        return tf.contrib.layers.group_norm(inputs=inputs,
                                            groups=16,
                                            channels_axis=-1,
                                            reduction_axes=(-4, -3, -2),
                                            activation_fn=None,
                                            trainable=True)
    if name == 'batchnorm':
        # Only batch norm distinguishes training from inference behavior.
        return tf.keras.layers.BatchNormalization(axis=-1,
                                                  trainable=True,
                                                  virtual_batch_size=None)(inputs, training=training)
    if name == 'none':
        return inputs
    raise ValueError('Invalid normalization layer')
+
+
+def _activation(x, activation):
+    if activation == 'relu':
+        return tf.nn.relu(x)
+    elif activation == 'leaky_relu':
+        return tf.nn.leaky_relu(x, alpha=0.01)
+    elif activation == 'sigmoid':
+        return tf.nn.sigmoid(x)
+    elif activation == 'softmax':
+        return tf.nn.softmax(x, axis=-1)
+    elif activation == 'none':
+        return x
+    else:
+        raise ValueError("Unknown activation {}".format(activation))
+
+
def convolution(x,
                out_channels,
                kernel_size=3,
                stride=1,
                mode=tf.estimator.ModeKeys.TRAIN,
                normalization='batchnorm',
                activation='leaky_relu',
                transpose=False):
    """Conv3D (or transposed Conv3D) followed by normalization and activation."""
    conv_layer = tf.keras.layers.Conv3DTranspose if transpose else tf.keras.layers.Conv3D

    # Weight regularization is disabled (an l2(1e-5) option was left
    # commented out in the original source).
    regularizer = None

    x = conv_layer(filters=out_channels,
                   kernel_size=kernel_size,
                   strides=stride,
                   activation=None,
                   padding='same',
                   data_format='channels_last',
                   kernel_initializer=tf.glorot_uniform_initializer(),
                   kernel_regularizer=regularizer,
                   bias_initializer=tf.zeros_initializer(),
                   bias_regularizer=regularizer)(x)

    x = _normalization(x, normalization, mode)
    return _activation(x, activation)
+
+
def upsample_block(x, skip_connection, out_channels, normalization, mode):
    """Transposed-conv upsample by 2, concat the skip connection, then two convs."""
    upsampled = convolution(x, kernel_size=2, out_channels=out_channels, stride=2,
                            normalization='none', activation='none', transpose=True)
    x = tf.keras.layers.Concatenate(axis=-1)([upsampled, skip_connection])

    for _ in range(2):
        x = convolution(x, out_channels=out_channels, normalization=normalization, mode=mode)
    return x
+
+
def input_block(x, out_channels, normalization, mode):
    """Two stride-1 convolutions applied to the network input."""
    for _ in range(2):
        x = convolution(x, out_channels=out_channels, normalization=normalization, mode=mode)
    return x
+
+
def downsample_block(x, out_channels, normalization, mode):
    """Stride-2 convolution (halves resolution) followed by a stride-1 convolution."""
    downsampled = convolution(x, out_channels=out_channels, normalization=normalization,
                              mode=mode, stride=2)
    return convolution(downsampled, out_channels=out_channels, normalization=normalization, mode=mode)
+
+
def linear_block(x, out_channels, mode, activation='leaky_relu', normalization='none'):
    """Two convolutions; only the second applies the requested output activation."""
    hidden = convolution(x, out_channels=out_channels, normalization=normalization, mode=mode)
    return convolution(hidden, out_channels=out_channels, activation=activation, mode=mode,
                       normalization=normalization)
+
+
def output_layer(x, out_channels, activation):
    """Final 3x3x3 convolution mapping features to class logits, then the activation."""
    logits = tf.keras.layers.Conv3D(out_channels,
                                    kernel_size=3,
                                    activation=None,
                                    padding='same',
                                    kernel_regularizer=None,
                                    kernel_initializer=tf.glorot_uniform_initializer(),
                                    bias_initializer=tf.zeros_initializer(),
                                    bias_regularizer=None)(x)
    return _activation(logits, activation)

+ 83 - 0
TensorFlow/Segmentation/UNet_3D_Medical/model/losses.py

@@ -0,0 +1,83 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
+def make_loss(params, y_true, y_pred):
+    """Dispatch to the loss selected by ``params.loss``.
+
+    :param params: parsed CLI namespace; ``params.loss`` is one of
+        'dice', 'ce', or 'dice+ce'
+    :param y_true: ground-truth labels (one channel per class)
+    :param y_pred: predicted per-class probabilities
+    :return: scalar loss tensor
+    :raises ValueError: for an unrecognized ``params.loss`` value
+    """
+    if params.loss == 'dice':
+        return _dice(y_true, y_pred)
+    if params.loss == 'ce':
+        return _ce(y_true, y_pred)
+    if params.loss == 'dice+ce':
+        # Named so hooks can fetch it as 'total_loss_ref:0'.
+        return tf.add(_ce(y_true, y_pred), _dice(y_true, y_pred), name="total_loss_ref")
+
+    raise ValueError('Unknown loss: {}'.format(params.loss))
+
+
+def _ce(y_true, y_pred):
+    """Binary cross-entropy: mean over batch + spatial axes, summed over classes.
+
+    Axes [0, 1, 2, 3] are batch plus the three spatial dimensions, so the
+    remaining (channel) axis is reduced by the outer sum.
+    """
+    return tf.reduce_sum(
+        tf.reduce_mean(tf.keras.backend.binary_crossentropy(tf.cast(y_true, tf.float32), y_pred), axis=[0, 1, 2, 3]),
+        name='crossentropy_loss_ref')
+
+
+def _dice(y_true, y_pred):
+    """Scalar dice loss: per-class dice losses summed over the class axis."""
+    return tf.reduce_sum(dice_loss(predictions=y_pred, targets=y_true), name='dice_loss_ref')
+
+
+def eval_dice(y_true, y_pred):
+    """Per-class dice *score* (1 - dice loss), used as the evaluation metric."""
+    return 1 - dice_loss(predictions=y_pred, targets=y_true)
+
+
+def dice_loss(predictions,
+              targets,
+              squared_pred=False,
+              smooth=1e-5,
+              top_smooth=0.0):
+    """Soft dice loss per class, averaged over the batch.
+
+    :param predictions: predicted probabilities, channels-last
+    :param targets: ground-truth maps with the same shape as predictions
+    :param squared_pred: if True, square targets/predictions in the
+        denominator (the intersection is computed before squaring, so the
+        numerator is unaffected)
+    :param smooth: additive denominator smoothing to avoid division by zero
+    :param top_smooth: additive numerator smoothing
+    :return: tensor of per-class dice losses (1 - dice score)
+    """
+    # Layout is hardcoded to channels-last; the flag is kept for readability.
+    is_channels_first = False
+
+    # Reduce over the spatial axes only, keeping batch and channel axes.
+    n_len = len(predictions.get_shape())
+    reduce_axis = list(range(2, n_len)) if is_channels_first else list(range(1, n_len - 1))
+    intersection = tf.reduce_sum(targets * predictions, axis=reduce_axis)
+
+    if squared_pred:
+        targets = tf.square(targets)
+        predictions = tf.square(predictions)
+
+    y_true_o = tf.reduce_sum(targets, axis=reduce_axis)
+    y_pred_o = tf.reduce_sum(predictions, axis=reduce_axis)
+
+    denominator = y_true_o + y_pred_o
+
+    f = (2.0 * intersection + top_smooth) / (denominator + smooth)
+
+    # Average the per-sample dice over the batch axis; result is per-class.
+    return 1 - tf.reduce_mean(f, axis=0)
+
+
+def total_dice(predictions,
+               targets,
+               smooth=1e-5,
+               top_smooth=0.0):
+    """Dice score over all classes merged into a single foreground map.
+
+    Class channels are collapsed by summation before the dice computation;
+    the result is logged as the 'WholeTumor' metric in the model function.
+
+    :param predictions: predicted per-class probabilities, channels-last
+    :param targets: ground-truth per-class maps, same shape as predictions
+    :param smooth: additive denominator smoothing
+    :param top_smooth: additive numerator smoothing
+    :return: scalar dice score averaged over the batch
+    """
+    n_len = len(predictions.get_shape())
+    reduce_axis = list(range(1, n_len-1))
+    # Merge class channels into one map each for targets and predictions.
+    targets = tf.reduce_sum(targets, axis=-1)
+    predictions = tf.reduce_sum(predictions, axis=-1)
+    intersection = tf.reduce_sum(targets * predictions, axis=reduce_axis)
+
+    y_true_o = tf.reduce_sum(targets, axis=reduce_axis)
+    y_pred_o = tf.reduce_sum(predictions, axis=reduce_axis)
+
+    denominator = y_true_o + y_pred_o
+
+    return tf.reduce_mean((2.0 * intersection + top_smooth) / (denominator + smooth))

+ 72 - 0
TensorFlow/Segmentation/UNet_3D_Medical/model/model_fn.py

@@ -0,0 +1,72 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import horovod.tensorflow as hvd
+import tensorflow as tf
+
+from model.unet3d import Builder
+from model.losses import make_loss, eval_dice, total_dice
+from dataset.data_loader import CLASSES
+
+
+def unet_3d(features, labels, mode, params):
+    """Estimator model_fn: builds the UNet-3D and the per-mode EstimatorSpec.
+
+    :param features: input volumes
+    :param labels: per-class label maps (ignored in PREDICT mode)
+    :param mode: tf.estimator.ModeKeys value
+    :param params: parsed CLI namespace (normalization, include_background,
+        loss, learning_rate, use_amp)
+    :return: tf.estimator.EstimatorSpec for the given mode
+    """
+
+    # n_classes is fixed at 4 — presumably the BraTS label set; see CLASSES
+    # in dataset.data_loader. TODO confirm if reusing for other datasets.
+    logits = Builder(n_classes=4, normalization=params.normalization, mode=mode)(features)
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+        # Hard class assignment per voxel, stored compactly as int8.
+        prediction = tf.argmax(input=logits, axis=-1, output_type=tf.dtypes.int32)
+        return tf.estimator.EstimatorSpec(mode=mode,
+                                          predictions={'predictions': tf.cast(prediction, tf.int8)})
+
+    labels = tf.cast(labels, tf.float32)
+    if not params.include_background:
+        # Drop channel 0 (background) from both labels and logits.
+        labels = labels[..., 1:]
+        logits = logits[..., 1:]
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+        # eval_acc is a per-class dice *score*; rounding binarizes the
+        # (softmax) outputs before scoring.
+        eval_acc = eval_dice(y_true=labels, y_pred=tf.round(logits))
+        total_eval_acc = total_dice(tf.round(logits), labels)
+        metrics = {CLASSES[i]: tf.metrics.mean(eval_acc[i]) for i in range(eval_acc.shape[-1])}
+        metrics['WholeTumor'] = tf.metrics.mean(total_eval_acc)
+        # NOTE(review): the 'loss' slot here carries the mean dice score,
+        # not a loss — confirm downstream log parsing expects this.
+        return tf.estimator.EstimatorSpec(mode=mode, loss=tf.reduce_mean(eval_acc),
+                                          eval_metric_ops=metrics)
+
+    loss = make_loss(params, y_pred=logits, y_true=labels)
+    # Stable tensor name so TrainingHook can fetch 'total_loss_ref:0'.
+    loss = tf.identity(loss, name="total_loss_ref")
+
+    global_step = tf.compat.v1.train.get_or_create_global_step()
+
+    # Horovod wrapper averages gradients across ranks.
+    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=params.learning_rate)
+    optimizer = hvd.DistributedOptimizer(optimizer)
+
+    # NGC has TF_ENABLE_AUTO_MIXED_PRECISION enabled by default. We cannot use
+    # both graph_rewrite and envar, so if we're not in NGC we do graph_rewrite
+    try:
+        amp_envar = int(os.environ['TF_ENABLE_AUTO_MIXED_PRECISION']) == 1
+    except KeyError:
+        amp_envar = False
+
+    if params.use_amp and not amp_envar:
+        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+            optimizer,
+            loss_scale='dynamic'
+        )
+
+    # Run UPDATE_OPS first so normalization statistics (e.g. batchnorm
+    # moving averages) are refreshed before each training step.
+    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
+        train_op = optimizer.minimize(loss, global_step=global_step)
+
+    return tf.estimator.EstimatorSpec(
+        mode=mode, loss=loss, train_op=train_op)

+ 82 - 0
TensorFlow/Segmentation/UNet_3D_Medical/model/unet3d.py

@@ -0,0 +1,82 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from model.layers import downsample_block, upsample_block, output_layer, input_block
+
+
+class Builder:
+    """Functional builder for the UNet-3D graph (encoder/decoder with skips).
+
+    Skip variables are named after the spatial side length they presumably
+    correspond to for a 128^3 input (skip_128 ... skip_8) — TODO confirm
+    against the data loader's patch size.
+    """
+    def __init__(self, n_classes, mode, normalization='none'):
+        # n_classes: output channels of the final softmax layer
+        # mode: estimator mode, forwarded to every conv block
+        # normalization: normalization variant for all conv blocks
+        self._n_classes = n_classes
+        self._mode = mode
+        self._normalization = normalization
+
+    def __call__(self, features):
+        """Build the network on `features` and return per-voxel class probabilities."""
+        # --- Encoder: channel widths 32 -> 64 -> 128 -> 256 -> 320 -> 320 ---
+        skip_128 = input_block(x=features,
+                               out_channels=32,
+                               normalization=self._normalization,
+                               mode=self._mode)
+
+        skip_64 = downsample_block(x=skip_128,
+                                   out_channels=64,
+                                   normalization=self._normalization,
+                                   mode=self._mode)
+
+        skip_32 = downsample_block(x=skip_64,
+                                   out_channels=128,
+                                   normalization=self._normalization,
+                                   mode=self._mode)
+
+        skip_16 = downsample_block(x=skip_32,
+                                   out_channels=256,
+                                   normalization=self._normalization,
+                                   mode=self._mode)
+
+        skip_8 = downsample_block(x=skip_16,
+                                  out_channels=320,
+                                  normalization=self._normalization,
+                                  mode=self._mode)
+
+        # Bottleneck.
+        x = downsample_block(x=skip_8,
+                             out_channels=320,
+                             normalization=self._normalization,
+                             mode=self._mode)
+
+        # --- Decoder: mirror the encoder, consuming skips coarsest-first ---
+        x = upsample_block(x, skip_8,
+                           out_channels=320,
+                           normalization=self._normalization,
+                           mode=self._mode)
+
+        x = upsample_block(x, skip_16,
+                           out_channels=256,
+                           normalization=self._normalization,
+                           mode=self._mode)
+
+        x = upsample_block(x, skip_32,
+                           out_channels=128,
+                           normalization=self._normalization,
+                           mode=self._mode)
+
+        x = upsample_block(x, skip_64,
+                           out_channels=64,
+                           normalization=self._normalization,
+                           mode=self._mode)
+
+        x = upsample_block(x, skip_128,
+                           out_channels=32,
+                           normalization=self._normalization,
+                           mode=self._mode)
+
+        # Softmax over the class channel.
+        return output_layer(x=x,
+                            out_channels=self._n_classes,
+                            activation='softmax')

+ 1 - 0
TensorFlow/Segmentation/UNet_3D_Medical/requirements.txt

@@ -0,0 +1 @@
+nibabel

+ 48 - 0
TensorFlow/Segmentation/UNet_3D_Medical/runtime/arguments.py

@@ -0,0 +1,48 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+# Command-line interface shared by all entry points (see main.py).
+PARSER = argparse.ArgumentParser(description="UNet-3D")
+
+# Estimator flags
+PARSER.add_argument('--model_dir', required=True, type=str)
+PARSER.add_argument('--exec_mode', choices=['train', 'evaluate', 'train_and_evaluate',
+                                            'predict', 'debug_train', 'debug_predict'], type=str)
+
+# Training flags
+# --benchmark switches hooks to throughput/latency profiling mode.
+PARSER.add_argument('--benchmark', dest='benchmark', action='store_true', default=False)
+PARSER.add_argument('--max_steps', default=16000, type=int)
+PARSER.add_argument('--learning_rate', default=0.0002, type=float)
+PARSER.add_argument('--log_every', default=100, type=int)
+PARSER.add_argument('--log_dir', type=str)
+PARSER.add_argument('--loss', choices=['dice', 'ce', 'dice+ce'], default='dice+ce', type=str)
+# Steps to skip before timing starts in benchmark mode.
+PARSER.add_argument('--warmup_steps', default=40, type=int)
+PARSER.add_argument('--normalization', choices=['instancenorm', 'batchnorm', 'groupnorm'],
+                    default='instancenorm', type=str)
+# When unset, the background channel is dropped from loss/metrics.
+PARSER.add_argument('--include_background', dest='include_background', action='store_true', default=False)
+PARSER.add_argument('--resume_training', dest='resume_training', action='store_true', default=False)
+
+# Augmentations
+PARSER.add_argument('--augment', dest='augment', action='store_true', default=False)
+
+# Dataset flags
+PARSER.add_argument('--data_dir', required=True, type=str)
+PARSER.add_argument('--batch_size', default=1, type=int)
+PARSER.add_argument('--fold', default=0, type=int)
+PARSER.add_argument('--num_folds', default=5, type=int)
+
+# Tensorflow configuration flags (--amp / --xla are short aliases)
+PARSER.add_argument('--use_amp', '--amp', dest='use_amp', action='store_true', default=False)
+PARSER.add_argument('--use_xla', '--xla', dest='use_xla', action='store_true', default=False)

+ 110 - 0
TensorFlow/Segmentation/UNet_3D_Medical/runtime/hooks.py

@@ -0,0 +1,110 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+import numpy as np
+import tensorflow as tf
+import horovod.tensorflow as hvd
+
+
+def get_hooks(params, logger):
+    if 'train' in params.exec_mode:
+        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
+        if hvd.rank() == 0:
+            if params.benchmark:
+                hooks += [ProfilingHook(warmup_steps=params.warmup_steps,
+                                        global_batch_size=hvd.size() * params.batch_size,
+                                        logger=logger,
+                                        mode='train')]
+            else:
+                hooks += [TrainingHook(log_every=params.log_every,
+                                       logger=logger,
+                                       tensor_names=['total_loss_ref:0'])]
+        return hooks
+
+    elif 'predict' == params.exec_mode:
+        hooks = []
+        if hvd.rank() == 0:
+            if params.benchmark:
+                hooks += [ProfilingHook(warmup_steps=params.warmup_steps,
+                                        global_batch_size=params.batch_size,
+                                        logger=logger,
+                                        mode='test')]
+            return hooks
+
+
+class ProfilingHook(tf.estimator.SessionRunHook):
+    """Collects per-step timestamps after a warmup and logs throughput/latency
+    statistics (via `process_performance_stats`) when the session ends."""
+    def __init__(self, warmup_steps, global_batch_size, logger, mode):
+        # warmup_steps: steps to skip before timing starts
+        # global_batch_size: samples per step across all ranks
+        # logger: dllogger-style logger with .log()/.flush()
+        # mode: tag used in metric names ('train' or 'test')
+        self._warmup_steps = warmup_steps
+        self._global_batch_size = global_batch_size
+        self._step = 0
+        self._timestamps = []
+        self._logger = logger
+        self._mode = mode
+
+    def before_run(self, run_context):
+        # Record a timestamp before each post-warmup step; step durations are
+        # derived from consecutive timestamps in end().
+        self._step += 1
+        if self._step >= self._warmup_steps:
+            self._timestamps.append(time.time())
+
+    def end(self, session):
+        # Consecutive-timestamp differences = per-step wall times (seconds).
+        deltas = np.array([self._timestamps[i + 1] - self._timestamps[i] for i in range(len(self._timestamps) - 1)])
+        stats = process_performance_stats(np.array(deltas),
+                                          self._global_batch_size,
+                                          self._mode)
+
+        self._logger.log(step=(), data={metric: float(value) for (metric, value) in stats})
+        self._logger.flush()
+
+
+class TrainingHook(tf.estimator.SessionRunHook):
+    """Fetches named graph tensors every step and logs them every
+    `log_every` steps (e.g. 'total_loss_ref:0')."""
+    def __init__(self, log_every, logger, tensor_names):
+        # log_every: logging period in steps
+        # logger: dllogger-style logger with .log()/.flush()
+        # tensor_names: graph tensor names to fetch each step
+        self._log_every = log_every
+        self._step = 0
+        self._logger = logger
+        self._tensor_names = tensor_names
+
+    def before_run(self, run_context):
+        # Ask the session to fetch the monitored tensors with this step.
+        run_args = tf.train.SessionRunArgs(
+            fetches=self._tensor_names
+        )
+
+        return run_args
+
+    def after_run(self,
+                  run_context,
+                  run_values):
+        # Log on step 0 and every log_every-th step thereafter.
+        if self._step % self._log_every == 0:
+            for i in range(len(self._tensor_names)):
+                self._logger.log(step=(self._step,), data={self._tensor_names[i]: str(run_values.results[i])})
+        self._step += 1
+
+    def end(self, session):
+        self._logger.flush()
+
+
+def process_performance_stats(timestamps, batch_size, mode):
+    """Turn per-step durations into throughput/latency log entries.
+
+    :param timestamps: array of per-step durations in seconds
+    :param batch_size: samples processed per step (global batch size)
+    :param mode: tag embedded in metric names ('train' or 'test')
+    :return: list of (metric_name, value_string) pairs
+    """
+    timestamps_ms = 1000 * timestamps
+    latency_ms = timestamps_ms.mean()
+    std = timestamps_ms.std()
+    n = np.sqrt(len(timestamps_ms))
+    # Mean of per-step throughputs (img/s), not total images / total time.
+    throughput_imgps = (1000.0 * batch_size / timestamps_ms).mean()
+
+    # NOTE(review): the trailing ':' in 'latency_{}:' looks unintentional but
+    # is kept — downstream log parsers may match the exact key.
+    stats = [("throughput_{}".format(mode), str(throughput_imgps)),
+             ('latency_{}:'.format(mode), str(latency_ms))]
+    # Upper confidence bounds on mean latency via the normal approximation
+    # (z-scores for 90/95/99%, standard error = std / sqrt(n)).
+    for ci, lvl in zip(["90%:", "95%:", "99%:"],
+                       [1.645, 1.960, 2.576]):
+        stats.append(("Latency_{} ".format(mode) + ci, str(latency_ms + lvl * std / n)))
+    return stats

+ 66 - 0
TensorFlow/Segmentation/UNet_3D_Medical/runtime/parse_results.py

@@ -0,0 +1,66 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+
+
+def parse_convergence_results(path, environment):
+    """Average dice scores from the last line of each matching logfile.
+
+    Scans `path` for files whose names contain both "log" and `environment`,
+    extracts the per-class dice scores from each file's final line, and
+    prints the averages. Files whose last line lacks "TumorCore" are
+    skipped with a warning.
+
+    NOTE: parsing relies on the exact evaluation log format — the scores
+    appear after "()" and are separated by double spaces.
+
+    :param path: directory containing the logfiles
+    :param environment: substring the logfile names must contain
+    :raises FileNotFoundError: when no matching logfile exists
+    """
+    whole_tumor = []
+    tumor_core = []
+    peritumoral_edema = []
+    enhancing_tumor = []
+    mean_dice = []
+    logfiles = [f for f in os.listdir(path) if "log" in f and environment in f]
+    if not logfiles:
+        raise FileNotFoundError("No logfile found at {}".format(path))
+    for logfile in logfiles:
+        with open(os.path.join(path, logfile), "r") as f:
+            content = f.readlines()
+        if "TumorCore" not in content[-1]:
+            print("Evaluation score not found. The file", logfile, "might be corrupted.")
+            continue
+        # Keep only the metrics portion after the "()" marker.
+        content = content[-1].split("()")[1]
+        # Each metric is "<Name> <value>"; entries are double-space separated.
+        whole_tumor.append(float([val for val in content.split("  ")
+                                  if "WholeTumor" in val][0].split()[-1]))
+        tumor_core.append(float([val for val in content.split("  ")
+                                 if "TumorCore" in val][0].split()[-1]))
+        peritumoral_edema.append(float([val for val in content.split("  ")
+                                        if "PeritumoralEdema" in val][0].split()[-1]))
+        enhancing_tumor.append(float([val for val in content.split("  ")
+                                      if "EnhancingTumor" in val][0].split()[-1]))
+        mean_dice.append(float([val for val in content.split("  ")
+                                if "MeanDice" in val][0].split()[-1]))
+
+    if whole_tumor:
+        print("Evaluation average dice score:", sum(mean_dice) / len(mean_dice))
+        print("Evaluation whole tumor dice score:", sum(whole_tumor) / len(whole_tumor))
+        print("Evaluation tumor core dice score:", sum(tumor_core) / len(tumor_core))
+        print("Evaluation peritumoral edema dice score:", sum(peritumoral_edema) / len(peritumoral_edema))
+        print("Evaluation enhancing tumor dice score:", sum(enhancing_tumor) / len(enhancing_tumor))
+    else:
+        print("All logfiles were corrupted, no loss was obtained.")
+
+
+# CLI entry point: aggregate dice scores from logfiles in --model_dir whose
+# names contain the --env tag.
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_dir',
+                        type=str,
+                        required=True)
+    parser.add_argument('--env',
+                        type=str,
+                        required=True)
+
+    args = parser.parse_args()
+    parse_convergence_results(path=args.model_dir, environment=args.env)

+ 84 - 0
TensorFlow/Segmentation/UNet_3D_Medical/runtime/setup.py

@@ -0,0 +1,84 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pickle
+import shutil
+
+import dllogger as logger
+import tensorflow as tf
+import horovod.tensorflow as hvd
+from dllogger import StdOutBackend, Verbosity, JSONStreamBackend
+
+from model.model_fn import unet_3d
+
+
+def set_flags():
+    """Set environment variables tuning TF/Horovod runtime behavior.
+
+    Must run before TensorFlow initializes. Silences TF C++ logging,
+    disables the CUDA kernel cache, forces NCCL allreduce for Horovod,
+    and toggles several TF performance knobs.
+    """
+    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+    os.environ['CUDA_CACHE_DISABLE'] = '1'
+    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
+    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
+    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '0'
+    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
+    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
+    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
+    os.environ['TF_SYNC_ON_FINISH'] = '0'
+
+
+def prepare_model_dir(params):
+    model_dir = os.path.join(params.model_dir, "model_chckpt")
+    model_dir = model_dir if (hvd.rank() == 0 and not params.benchmark) else None
+    if model_dir is not None:
+        os.makedirs(model_dir, exist_ok=True)
+        if ('train' in params.exec_mode) and (not params.resume_training):
+            os.system('rm -rf {}/*'.format(model_dir))
+
+    return model_dir
+
+
+def build_estimator(params, model_dir):
+    """Construct the tf.estimator.Estimator with per-rank session config.
+
+    :param params: parsed CLI namespace (use_xla, use_amp, max_steps,
+        benchmark)
+    :param model_dir: checkpoint directory (or None on non-zero ranks /
+        benchmark runs; see `prepare_model_dir`)
+    :return: configured Estimator running `unet_3d`
+    """
+    config = tf.compat.v1.ConfigProto(gpu_options=tf.compat.v1.GPUOptions(), allow_soft_placement=True)
+
+    if params.use_xla:
+        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+
+    # Each Horovod rank sees only its own GPU.
+    config.gpu_options.allow_growth = True
+    config.gpu_options.visible_device_list = str(hvd.local_rank())
+
+    if params.use_amp:
+        config.graph_options.rewrite_options.auto_mixed_precision = 1
+
+    # Rank 0 saves a single checkpoint at the end of its share of steps;
+    # benchmark runs save none.
+    checkpoint_steps = (params.max_steps // hvd.size()) if hvd.rank() == 0 else None
+    checkpoint_steps = checkpoint_steps if not params.benchmark else None
+    run_config = tf.estimator.RunConfig(
+        save_summary_steps=params.max_steps,
+        session_config=config,
+        save_checkpoints_steps=checkpoint_steps,
+        keep_checkpoint_max=1)
+
+    return tf.estimator.Estimator(
+        model_fn=unet_3d,
+        model_dir=model_dir,
+        config=run_config,
+        params=params)
+
+
+def get_logger(params):
+    """Initialize dllogger backends and return the (module-level) logger.
+
+    Only rank 0 gets backends: stdout always, plus a JSON stream when
+    --log_dir is set. Other ranks get a no-op logger (empty backend list).
+
+    :param params: parsed CLI namespace (log_dir)
+    :return: the initialized dllogger module
+    """
+    backends = []
+    if hvd.rank() == 0:
+        backends += [StdOutBackend(Verbosity.VERBOSE)]
+        if params.log_dir:
+            backends += [JSONStreamBackend(Verbosity.VERBOSE, params.log_dir)]
+    logger.init(backends=backends)
+    return logger