|
|
@@ -31,19 +31,19 @@ This repository provides a script and recipe to train the nnU-Net model to achie
|
|
|
* [Results](#results)
|
|
|
* [Training accuracy results](#training-accuracy-results)
|
|
|
* [Training accuracy: NVIDIA DGX A100 (8x A100 80G)](#training-accuracy-nvidia-dgx-a100-8x-a100-80g)
|
|
|
- * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
|
|
|
+ * [Training accuracy: NVIDIA DGX-1 (8x V100 32G)](#training-accuracy-nvidia-dgx-1-8x-v100-32g)
|
|
|
* [Training performance results](#training-performance-results)
|
|
|
* [Training performance: NVIDIA DGX A100 (8x A100 80G)](#training-performance-nvidia-dgx-a100-8x-a100-80g)
|
|
|
- * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
|
|
|
+ * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
|
|
|
* [Inference performance results](#inference-performance-results)
|
|
|
* [Inference performance: NVIDIA DGX A100 (1x A100 80G)](#inference-performance-nvidia-dgx-a100-1x-a100-80g)
|
|
|
- * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
|
|
|
+ * [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
|
|
|
- [Release notes](#release-notes)
|
|
|
* [Changelog](#changelog)
|
|
|
* [Known issues](#known-issues)
|
|
|
|
|
|
## Model overview
|
|
|
-
|
|
|
+
|
|
|
The nnU-Net ("no-new-Net") refers to a robust and self-adapting framework for U-Net based medical image segmentation. This repository contains a nnU-Net implementation as described in the paper: [nnU-Net: Self-adapting Framework for U-Net-Based Medical Image Segmentation](https://arxiv.org/abs/1809.10486).
|
|
|
|
|
|
The differences between this nnU-Net and the [original model](https://github.com/MIC-DKFZ/nnUNet) are:
|
|
|
@@ -53,27 +53,27 @@ The differences between this nnU-net and [original model](https://github.com/MIC
|
|
|
|
|
|
This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
|
|
|
|
|
|
-We developed the model using [PyTorch Lightning](https://www.pytorchlightning.ai), a new easy to use framework that ensures code readability and reproducibility without the boilerplate.
|
|
|
+We developed the model using [PyTorch Lightning](https://www.pytorchlightning.ai), an easy-to-use framework that ensures code readability and reproducibility without boilerplate.
|
|
|
|
|
|
### Model architecture
|
|
|
-
|
|
|
-The nnU-Net allows training two types of networks: 2D U-Net and 3D U-Net to perform semantic segmentation of 3D images, with high accuracy and performance.
|
|
|
+
|
|
|
+The nnU-Net allows the training of two types of networks, 2D U-Net and 3D U-Net, to perform semantic segmentation of 3D images with high accuracy and performance.
|
|
|
|
|
|
-The following figure shows the architecture of the 3D U-Net model and its different components. U-Net is composed of a contractive and an expanding path, that aims at building a bottleneck in its centremost part through a combination of convolution, instance norm and leaky relu operations. After this bottleneck, the image is reconstructed through a combination of convolutions and upsampling. Skip connections are added with the goal of helping the backward flow of gradients in order to improve the training.
|
|
|
+The following figure shows the architecture of the 3D U-Net model and its different components. U-Net is composed of a contractive and an expanding path that aim at building a bottleneck in its centermost part through a combination of convolution, instance norm, and leaky ReLU operations. After this bottleneck, the image is reconstructed through a combination of convolutions and upsampling. Skip connections are added with the goal of helping the backward flow of gradients to improve the training.
|
|
|
|
|
|
<img src="images/unet3d.png" width="900"/>
|
|
|
-
|
|
|
+
|
|
|
*Figure 1: The 3D U-Net architecture*
|
|
|
|
|
|
### Default configuration
|
|
|
|
|
|
-All convolution blocks in U-Net in both encoder and decoder are using two convolution layers followed by instance normalization and a leaky ReLU nonlinearity. For downsampling we are using stride convolution whereas transposed convolution for upsampling.
|
|
|
+All convolution blocks in U-Net in both the encoder and decoder use two convolution layers followed by instance normalization and a leaky ReLU nonlinearity. For downsampling, we use strided convolution, whereas transposed convolution is used for upsampling.
|
|
|
|
|
|
-All models were trained with Adam optimizer. For loss function we use the average of [cross-entropy](https://en.wikipedia.org/wiki/Cross_entropy) and [dice coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient).
|
|
|
+All models were trained with the Adam optimizer. For the loss function, we use the average of [cross-entropy](https://en.wikipedia.org/wiki/Cross_entropy) and [dice coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient).
|
|
|
|
|
|
-Early stopping is triggered if validation dice score wasn't improved during the last 100 epochs.
|
|
|
+Early stopping is triggered if the validation dice score has not improved during the last 100 epochs.
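For illustration, this policy maps onto a standard PyTorch Lightning early-stopping callback; a minimal sketch, assuming the validation dice is logged under a metric key named `dice_mean` (a hypothetical name):

```
from pytorch_lightning.callbacks import EarlyStopping

# Stop training once the monitored validation dice has not improved
# for 100 consecutive epochs; "dice_mean" is a hypothetical metric key.
early_stopping = EarlyStopping(monitor="dice_mean", patience=100, mode="max")
# Pass it to the trainer: pl.Trainer(callbacks=[early_stopping], ...)
```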
|
|
|
|
|
|
-Used data augmentation: crop with oversampling the foreground class, mirroring, zoom, Gaussian noise, Gaussian blur, brightness, contrast.
|
|
|
+Data augmentation used: cropping with oversampling of the foreground class, mirroring, zoom, Gaussian noise, Gaussian blur, brightness, and contrast.
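The repository implements these augmentations with DALI; purely for illustration, a rough equivalent using MONAI dictionary transforms (probabilities and parameter values here are assumptions, not the repository's settings) could look like:

```
import monai.transforms as T

# Illustrative sketch only; the actual pipeline in this repository is
# implemented in DALI, and these parameters are assumptions.
augment = T.Compose([
    T.RandCropByPosNegLabeld(keys=["image", "label"], label_key="label",
                             spatial_size=(128, 128, 128), pos=2, neg=1),  # oversampled foreground crop
    T.RandFlipd(keys=["image", "label"], prob=0.5),                        # mirroring
    T.RandZoomd(keys=["image", "label"], prob=0.15, min_zoom=0.9, max_zoom=1.1),
    T.RandGaussianNoised(keys="image", prob=0.15),
    T.RandGaussianSmoothd(keys="image", prob=0.15),                        # Gaussian blur
    T.RandScaleIntensityd(keys="image", factors=0.3, prob=0.15),           # brightness
    T.RandAdjustContrastd(keys="image", prob=0.15),                        # contrast
])
```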
|
|
|
|
|
|
### Feature support matrix
|
|
|
|
|
|
@@ -83,26 +83,26 @@ The following features are supported by this model:
|
|
|
|-----------------------|--------------------------
|
|
|
|[DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html) | Yes
|
|
|
|Automatic mixed precision (AMP) | Yes
|
|
|
-|Distributed data parallel (DDP) | Yes
|
|
|
-
|
|
|
+|Distributed data-parallel (DDP) | Yes
|
|
|
+
|
|
|
#### Features
|
|
|
|
|
|
**DALI**
|
|
|
|
|
|
-NVIDIA DALI - DALI is a library accelerating data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
|
|
|
+NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To speed up your input pipeline, you only need to define your data loader
|
|
|
with the DALI library. For details, see example sources in this repository or see the [DALI documentation](https://docs.nvidia.com/deeplearning/dali/index.html).
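Purely as a sketch (the file lists and pipeline parameters are assumptions, not this repository's exact loader), a minimal DALI pipeline over preprocessed NumPy volumes could look like:

```
from nvidia.dali import fn, pipeline_def

@pipeline_def(batch_size=2, num_threads=4, device_id=0)
def train_pipeline(image_files, label_files):
    # Read preprocessed NumPy volumes on the CPU and move them to the GPU;
    # augmentations would be chained here as further fn.* operators.
    images = fn.readers.numpy(files=image_files, name="image_reader")
    labels = fn.readers.numpy(files=label_files, name="label_reader")
    return images.gpu(), labels.gpu()

# Usage sketch: pipe = train_pipeline(imgs, lbls); pipe.build(); images, labels = pipe.run()
```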
|
|
|
|
|
|
**Automatic Mixed Precision (AMP)**
|
|
|
|
|
|
-This implementation uses native PyTorch AMP implementation of mixed precision training. It allows us to use FP16 training with FP32 master weights by modifying just a few lines of code.
|
|
|
+This implementation uses the native PyTorch AMP implementation of mixed precision training. It allows us to use FP16 training with FP32 master weights by modifying a few lines of code.
|
|
|
|
|
|
**DistributedDataParallel (DDP)**
|
|
|
|
|
|
The model uses the PyTorch Lightning implementation of distributed data parallelism at the module level, which can run across multiple machines.
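In PyTorch Lightning, enabling DDP reduces to the trainer configuration; a minimal sketch (the model and data module are placeholders):

```
import pytorch_lightning as pl

# Lightning wraps the module in DistributedDataParallel when the "ddp"
# strategy is selected; one process per GPU, optionally across machines.
trainer = pl.Trainer(accelerator="gpu", devices=8, strategy="ddp")
# trainer.fit(model, datamodule=data)  # placeholders for your module and data
```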
|
|
|
-
|
|
|
+
|
|
|
### Mixed precision training
|
|
|
|
|
|
-Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
|
|
|
+Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to keep as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
|
|
|
|
|
|
1. Porting the model to use the FP16 data type where appropriate.
|
|
|
2. Adding loss scaling to preserve small gradient values.
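With native PyTorch AMP, both steps reduce to an `autocast` context plus a `GradScaler`; a minimal training-loop sketch (the model, optimizer, criterion, and loader are placeholders):

```
import torch

scaler = torch.cuda.amp.GradScaler()
for images, labels in loader:              # placeholder data loader
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():        # step 1: run ops in FP16 where safe
        loss = criterion(model(images), labels)
    scaler.scale(loss).backward()          # step 2: scale loss to preserve small gradients
    scaler.step(optimizer)                 # unscales gradients, then steps
    scaler.update()
```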
|
|
|
@@ -115,14 +115,14 @@ For information about:
|
|
|
* APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
|
|
|
|
|
|
#### Enabling mixed precision
|
|
|
-
|
|
|
+
|
|
|
For training and inference, mixed precision can be enabled by adding the `--amp` flag. Mixed precision uses the [native PyTorch implementation](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/).
|
|
|
|
|
|
#### TF32
|
|
|
|
|
|
TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
|
|
|
|
|
|
-TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
|
|
|
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models that require a high dynamic range for weights or activations.
|
|
|
|
|
|
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
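If you need to control TF32 explicitly, PyTorch exposes backend flags for it; note that the defaults vary across PyTorch versions (NGC containers enable TF32 by default):

```
import torch

# Explicitly enable (or disable) TF32 for matmuls and cuDNN convolutions.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
```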
|
|
|
|
|
|
@@ -132,32 +132,32 @@ TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by defaul
|
|
|
|
|
|
**Test time augmentation**
|
|
|
|
|
|
-Test time augmentation is an inference technique which averages predictions from augmented images with its prediction. As a result, predictions are more accurate, but with the cost of slower inference process. For nnU-Net, we use all possible flip combinations for image augmenting. Test time augmentation can be enabled by adding the `--tta` flag.
|
|
|
+Test time augmentation is an inference technique that averages predictions from augmented images with the prediction for the original image. As a result, predictions are more accurate, but at the cost of a slower inference process. For nnU-Net, we use all possible flip combinations for image augmenting. Test time augmentation can be enabled by adding the `--tta` flag.
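For a 3D volume, this amounts to averaging the model outputs over the eight flip combinations of the spatial axes; a minimal sketch (hypothetical `model` taking and returning NCDHW tensors):

```
import itertools
import torch

def tta_predict(model, volume):
    # Average predictions over all 2^3 = 8 flip combinations of the spatial
    # axes (dims 2, 3, 4 of an NCDHW tensor), flipping each prediction back.
    preds = []
    for r in range(4):
        for axes in itertools.combinations((2, 3, 4), r):
            flipped = torch.flip(volume, axes) if axes else volume
            pred = model(flipped)
            preds.append(torch.flip(pred, axes) if axes else pred)
    return torch.stack(preds).mean(dim=0)
```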
|
|
|
|
|
|
## Setup
|
|
|
|
|
|
-The following section lists the requirements that you need to meet in order to start training the nnU-Net model.
|
|
|
+The following section lists the requirements that you need to meet to start training the nnU-Net model.
|
|
|
|
|
|
### Requirements
|
|
|
|
|
|
This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
|
|
|
-- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
|
|
|
-- PyTorch 21.11 NGC container
|
|
|
+- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
|
|
|
+- PyTorch 22.11 NGC container
|
|
|
- Supported GPUs:
|
|
|
- - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
|
|
|
- - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
|
|
|
- - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
|
|
|
+ - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
|
|
|
+ - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
|
|
|
+ - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
|
|
|
|
|
|
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
|
|
|
-- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
|
|
|
-- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
|
|
|
+- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
|
|
|
+- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
|
|
|
- Running [PyTorch](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html#running)
|
|
|
-
|
|
|
+
|
|
|
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
|
|
|
|
|
|
## Quick Start Guide
|
|
|
|
|
|
-To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the nnUNet model on the [Medical Segmentation Decathlon](http://medicaldecathlon.com/) dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
|
|
|
+To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the nnUNet model on the [Medical Segmentation Decathlon](http://medicaldecathlon.com/) dataset. For the specifics on training and inference, see the [Advanced](#advanced) section.
|
|
|
|
|
|
1. Clone the repository.
|
|
|
|
|
|
@@ -166,27 +166,27 @@ Executing this command will create your local repository with all the code to ru
|
|
|
git clone https://github.com/NVIDIA/DeepLearningExamples
|
|
|
cd DeepLearningExamples/PyTorch/Segmentation/nnUNet
|
|
|
```
|
|
|
-
|
|
|
+
|
|
|
2. Build the nnU-Net PyTorch NGC container.
|
|
|
-
|
|
|
+
|
|
|
This command will use the Dockerfile to create a Docker image named `nnunet`, downloading all the required components automatically.
|
|
|
|
|
|
```
|
|
|
docker build -t nnunet .
|
|
|
```
|
|
|
-
|
|
|
+
|
|
|
The NGC container contains all the components optimized for usage on NVIDIA hardware.
|
|
|
-
|
|
|
+
|
|
|
3. Start an interactive session in the NGC container to run preprocessing/training/inference.
|
|
|
-
|
|
|
+
|
|
|
The following command will launch the container and mount the `./data` directory as a volume to the `/data` directory inside the container, and `./results` directory to the `/results` directory in the container.
|
|
|
-
|
|
|
+
|
|
|
```
|
|
|
mkdir data results
|
|
|
-docker run -it --runtime=nvidia --shm-size=8g --ulimit memlock=-1 --ulimit stack=67108864 --rm -v ${PWD}/data:/data -v ${PWD}/results:/results nnunet:latest /bin/bash
|
|
|
+docker run -it --privileged --runtime=nvidia --shm-size=8g --ulimit memlock=-1 --ulimit stack=67108864 --rm -v ${PWD}/data:/data -v ${PWD}/results:/results nnunet:latest /bin/bash
|
|
|
```
|
|
|
|
|
|
-4. Prepare BraTS dataset.
|
|
|
+4. Prepare the BraTS dataset.
|
|
|
|
|
|
To download and preprocess the data run:
|
|
|
```
|
|
|
@@ -200,13 +200,13 @@ Then `ls /data` should print:
|
|
|
01_3d 01_2d Task01_BrainTumour
|
|
|
```
|
|
|
|
|
|
-For the specifics concerning data preprocessing, see the [Getting the data](#getting-the-data) section.
|
|
|
-
|
|
|
+For the specifics on data preprocessing, see the [Getting the data](#getting-the-data) section.
|
|
|
+
|
|
|
5. Start training.
|
|
|
-
|
|
|
+
|
|
|
Training can be started with:
|
|
|
```
|
|
|
-python scripts/train.py --gpus <gpus> --fold <fold> --dim <dim> [--amp]
|
|
|
+python scripts/train.py --gpus <gpus> --fold <fold> --dim <dim> [--amp] [--bind]
|
|
|
```
|
|
|
|
|
|
To see descriptions of the train script arguments run `python scripts/train.py --help`. You can customize the training process. For details, see the [Training process](#training-process) section.
|
|
|
@@ -216,14 +216,14 @@ To see descriptions of the train script arguments run `python scripts/train.py -
|
|
|
The training and inference performance can be evaluated by using benchmarking scripts, such as:
|
|
|
|
|
|
```
|
|
|
-python scripts/benchmark.py --mode {train,predict} --gpus <ngpus> --dim {2,3} --batch_size <bsize> [--amp]
|
|
|
+python scripts/benchmark.py --mode {train,predict} --gpus <ngpus> --dim {2,3} --batch_size <bsize> [--amp] [--bind]
|
|
|
```
|
|
|
|
|
|
To see descriptions of the benchmark script arguments run `python scripts/benchmark.py --help`.
|
|
|
|
|
|
|
|
|
7. Start inference/predictions.
|
|
|
-
|
|
|
+
|
|
|
Inference can be started with:
|
|
|
```
|
|
|
python scripts/inference.py --data <path/to/data> --dim <dim> --fold <fold> --ckpt_path <path/to/checkpoint> [--amp] [--tta] [--save_preds]
|
|
|
@@ -236,7 +236,7 @@ python preprocess.py --task 01 --dim 3 --exec_mode val
|
|
|
python scripts/inference.py --data /data/01_3d/val --dim 3 --fold 0 --ckpt_path <path/to/checkpoint> --amp --tta --save_preds
|
|
|
```
|
|
|
|
|
|
-Then if you have labels for predicted images you can evaluate it with `evaluate.py` script. For example:
|
|
|
+Then, if you have labels for the predicted images, you can evaluate them with the `evaluate.py` script. For example:
|
|
|
|
|
|
```
|
|
|
python evaluate.py --preds /results/preds_task_01_dim_3_fold_0_tta --lbls /data/Task01_BrainTumour/labelsTr
|
|
|
@@ -245,7 +245,7 @@ python evaluate.py --preds /results/preds_task_01_dim_3_fold_0_tta --lbls /data/
|
|
|
To see descriptions of the inference script arguments run `python scripts/inference.py --help`. You can customize the inference process. For details, see the [Inference process](#inference-process) section.
|
|
|
|
|
|
Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training performance benchmark](#training-performance-results) or the [Inference performance benchmark](#inference-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
|
|
|
-
|
|
|
+
|
|
|
## Advanced
|
|
|
|
|
|
The following sections provide greater details of the dataset, running training and inference, and the training results.
|
|
|
@@ -259,25 +259,24 @@ In the root directory, the most important files are:
|
|
|
* `download.py`: Downloads a given dataset from [Medical Segmentation Decathlon](http://medicaldecathlon.com/).
|
|
|
* `Dockerfile`: Container with the basic set of dependencies to run nnU-Net.
|
|
|
* `requirements.txt`: Set of extra requirements for running nnU-Net.
|
|
|
-* `evaluate.py`: Compare predictions with ground truth and get final score.
|
|
|
-
|
|
|
+* `evaluate.py`: Compares predictions with the ground truth and gets the final score.
|
|
|
+
|
|
|
The `data_preprocessing` folder contains information about the data preprocessing used by nnU-Net. Its contents are:
|
|
|
-
|
|
|
+
|
|
|
* `configs.py`: Defines dataset configuration like patch size or spacing.
|
|
|
* `preprocessor.py`: Implements data preprocessing pipeline.
|
|
|
-* `convert2tfrec.py`: Implements conversion from NumPy files to tfrecords.
|
|
|
-
|
|
|
+
|
|
|
The `data_loading` folder contains information about the data pipeline used by nnU-Net. Its contents are:
|
|
|
-
|
|
|
+
|
|
|
* `data_module.py`: Defines `LightningDataModule` used by PyTorch Lightning.
|
|
|
* `dali_loader.py`: Implements DALI data loader.
|
|
|
-
|
|
|
+
|
|
|
The `nnunet` folder contains information about the building blocks of nnU-Net and the way they are assembled. Its contents are:
|
|
|
-
|
|
|
+
|
|
|
* `metrics.py`: Implements the dice metric.
|
|
|
* `loss.py`: Implements loss function.
|
|
|
* `nn_unet.py`: Implements training/validation/test logic and dynamic creation of U-Net architecture used by nnU-Net.
|
|
|
-
|
|
|
+
|
|
|
The `utils` folder includes:
|
|
|
|
|
|
* `args.py`: Defines command line arguments.
|
|
|
@@ -286,16 +285,16 @@ The `utils` folder includes:
|
|
|
|
|
|
The `notebooks` folder includes:
|
|
|
|
|
|
-* `BraTS21.ipynb`: Notebook with our solution for BraTS21 challenge.
|
|
|
-* `custom_dataset.ipynb`: Notebook which demonstrates how to use nnU-Net with custom dataset.
|
|
|
+* `BraTS21.ipynb`: Notebook with our solution, ranked 3rd in the BraTS21 challenge.
|
|
|
+* `BraTS22.ipynb`: Notebook with our solution, ranked 2nd in the BraTS22 challenge.
|
|
|
+* `custom_dataset.ipynb`: Notebook that demonstrates how to use nnU-Net with a custom dataset.
|
|
|
|
|
|
Other folders included in the root directory are:
|
|
|
|
|
|
* `images/`: Contains a model diagram.
|
|
|
-* `scripts/`: Provides scripts for training, benchmarking and inference of nnU-Net.
|
|
|
+* `scripts/`: Provides scripts for training, benchmarking, and inference of nnU-Net.
|
|
|
|
|
|
### Command-line options
|
|
|
-
|
|
|
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
|
|
|
|
|
|
`python main.py --help`
|
|
|
@@ -386,11 +385,11 @@ The nnU-Net model was trained on the [Medical Segmentation Decathlon](http://med
|
|
|
|
|
|
#### Dataset guidelines
|
|
|
|
|
|
-To train nnU-Net you will need to preprocess your dataset as a first step with `preprocess.py` script. Run `python scripts/preprocess.py --help` to see descriptions of the preprocess script arguments.
|
|
|
+To train nnU-Net, you will need to preprocess your dataset as the first step with the `preprocess.py` script. Run `python scripts/preprocess.py --help` to see descriptions of the preprocess script arguments.
|
|
|
|
|
|
For example, to preprocess data for 3D U-Net, run: `python preprocess.py --task 01 --dim 3`.
|
|
|
|
|
|
-In `data_preprocessing/configs.py` for each [Medical Segmentation Decathlon](http://medicaldecathlon.com/) task there are defined: patch size, precomputed spacings and statistics for CT datasets.
|
|
|
+In `data_preprocessing/configs.py`, the patch size, precomputed spacings, and statistics for CT datasets are defined for each [Medical Segmentation Decathlon](http://medicaldecathlon.com/) task.
|
|
|
|
|
|
The preprocessing pipeline consists of the following steps:
|
|
|
|
|
|
@@ -398,42 +397,42 @@ The preprocessing pipeline consists of the following steps:
|
|
|
2. Resampling to the median voxel spacing of their respective dataset (exception for anisotropic datasets where the lowest resolution axis is selected to be the 10th percentile of the spacings).
|
|
|
3. Padding volumes so that their dimensions are at least as large as the patch size.
|
|
|
4. Normalizing:
|
|
|
- * For CT modalities the voxel values are clipped to 0.5 and 99.5 percentiles of the foreground voxels and then data is normalized with mean and standard deviation from collected from foreground voxels.
|
|
|
- * For MRI modalities z-score normalization is applied.
|
|
|
+ * For CT modalities, the voxel values are clipped to the 0.5 and 99.5 percentiles of the foreground voxels, and then the data is normalized with the mean and standard deviation collected from the foreground voxels.
|
|
|
+ * For MRI modalities, z-score normalization is applied.
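As a NumPy sketch of the normalization step (variable names are illustrative; the percentiles follow the description above):

```
import numpy as np

def normalize_ct(volume, foreground_mask):
    # Clip to the 0.5 and 99.5 percentiles of the foreground voxels, then
    # standardize with the foreground mean and standard deviation.
    fg = volume[foreground_mask]
    low, high = np.percentile(fg, [0.5, 99.5])
    clipped = np.clip(volume, low, high)
    return (clipped - fg.mean()) / fg.std()

def normalize_mri(volume):
    # Plain z-score normalization.
    return (volume - volume.mean()) / volume.std()
```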
|
|
|
|
|
|
#### Multi-dataset
|
|
|
|
|
|
-It is possible to run nnUNet on custom dataset. If your dataset correspond to [Medical Segmentation Decathlon](http://medicaldecathlon.com/) (i.e. data should be `NIfTi` format and there should be `dataset.json` file where you need to provide fields: modality, labels and at least one of training, test) you need to perform the following:
|
|
|
+It is possible to run nnU-Net on a custom dataset. If your dataset corresponds to the [Medical Segmentation Decathlon](http://medicaldecathlon.com/) format (i.e., the data is in `NIfTI` format and there is a `dataset.json` file providing the fields: modality, labels, and at least one of training or test), you need to perform the following:
|
|
|
|
|
|
-1. Mount your dataset to `/data` directory.
|
|
|
-
|
|
|
+1. Mount your dataset to the `/data` directory.
|
|
|
+
|
|
|
2. In `data_preprocessing/config.py`:
|
|
|
- - Add to the `task_dir` dictionary your dataset directory name. For example, for Brain Tumour dataset, it corresponds to `"01": "Task01_BrainTumour"`.
|
|
|
- - Add the patch size that you want to use for training to the `patch_size` dictionary. For example, for Brain Tumour dataset it corresponds to `"01_3d": [128, 128, 128]` for 3D U-Net and `"01_2d": [192, 160]` for 2D U-Net. There are three types of suffixes `_3d, _2d` they correspond to 3D UNet and 2D U-Net.
|
|
|
+ - Add your dataset directory name to the `task_dir` dictionary. For example, for the Brain Tumour dataset, it corresponds to `"01": "Task01_BrainTumour"`.
|
|
|
+ - Add the patch size that you want to use for training to the `patch_size` dictionary. For example, for the Brain Tumour dataset, it corresponds to `"01_3d": [128, 128, 128]` for 3D U-Net and `"01_2d": [192, 160]` for 2D U-Net. There are two types of suffixes, `_3d` and `_2d`, corresponding to 3D U-Net and 2D U-Net (see the configuration sketch after this list).
|
|
|
|
|
|
-3. Preprocess your data with `preprocess.py` scripts. For example, to preprocess Brain Tumour dataset for 2D U-Net you should run `python preprocess.py --task 01 --dim 2`.
|
|
|
+3. Preprocess your data with the `preprocess.py` script. For example, to preprocess the Brain Tumour dataset for 2D U-Net, you should run `python preprocess.py --task 01 --dim 2`.
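For illustration, the entries described in step 2 could look like the following sketch (values taken from the Brain Tumour example above; the commented custom entry is hypothetical):

```
# Sketch of the dictionaries in data_preprocessing/config.py described in step 2.
task_dir = {
    "01": "Task01_BrainTumour",
    # "42": "Task42_MyDataset",  # hypothetical custom dataset entry
}

patch_size = {
    "01_3d": [128, 128, 128],  # 3D U-Net
    "01_2d": [192, 160],       # 2D U-Net
}
```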
|
|
|
|
|
|
-If you have dataset in other format or you want customize data preprocessing or data loading see `notebooks/custom_dataset.ipynb`.
|
|
|
+If you have a dataset in another format, or you want to customize data preprocessing or data loading, see `notebooks/custom_dataset.ipynb`.
|
|
|
|
|
|
### Training process
|
|
|
|
|
|
The model trains for at least `--min_epochs` and at most `--max_epochs` epochs. After each epoch, evaluation on the validation set is performed, and the validation loss is monitored for early stopping (see the `--patience` flag). Default training settings are:
|
|
|
-* Adam optimizer with learning rate of 0.0008 and weight decay 0.0001.
|
|
|
+* Adam optimizer with a learning rate of 0.0008 and weight decay of 0.0001.
|
|
|
* Training batch size is set to 2 for 3D U-Net and 16 for 2D U-Net.
|
|
|
-
|
|
|
-This default parametrization is applied when running scripts from the `scripts/` directory and when running `main.py` without explicitly overriding these parameters. By default, the training is in full precision. To enable AMP, pass the `--amp` flag. AMP can be enabled for every mode of execution.
|
|
|
+
|
|
|
+This default parametrization is applied when running scripts from the `scripts` directory and when running `main.py` without overriding these parameters. By default, the training is in full precision. To enable AMP, pass the `--amp` flag. AMP can be enabled for every mode of execution.
|
|
|
|
|
|
-The default configuration minimizes a function `L = (1 - dice_coefficient) + cross_entropy` during training and reports achieved convergence as [dice coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) per class. The training, with a combination of dice and cross entropy has been proven to achieve better convergence than a training using only dice.
|
|
|
+The default configuration minimizes the function `L = (1 - dice_coefficient) + cross_entropy` during training and reports achieved convergence as the [dice coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) per class. Training with a combination of dice and cross-entropy has been shown to achieve better convergence than training using only dice.
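A minimal PyTorch sketch of this objective (integer-label targets and softmax over the class dimension are assumptions; this is not the repository's exact `loss.py`):

```
import torch
import torch.nn.functional as F

def loss_fn(logits, target, eps=1e-5):
    # L = (1 - dice_coefficient) + cross_entropy, with dice computed from
    # softmax probabilities against one-hot encoded integer labels.
    ce = F.cross_entropy(logits, target)
    probs = torch.softmax(logits, dim=1)
    one_hot = F.one_hot(target, num_classes=logits.shape[1]).movedim(-1, 1).float()
    dims = tuple(range(2, logits.dim()))                 # spatial dimensions
    intersection = (probs * one_hot).sum(dims)
    dice = (2 * intersection + eps) / (probs.sum(dims) + one_hot.sum(dims) + eps)
    return (1 - dice.mean()) + ce
```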
|
|
|
|
|
|
-The training can be run directly without using the predefined scripts. The name of the training script is `main.py`. For example:
|
|
|
+The training can be run without using the predefined scripts. The name of the training script is `main.py`. For example:
|
|
|
|
|
|
```
|
|
|
python main.py --exec_mode train --task 01 --fold 0 --gpus 1 --amp
|
|
|
```
|
|
|
-
|
|
|
+
|
|
|
Training artifacts will be saved to `/results` in the container. Some important artifacts are:
|
|
|
-* `/results/logs.json`: Collected dice scores and loss values evaluated after each epoch during training on validation set.
|
|
|
-* `/results/checkpoints`: Saved checkpoints. By default, two checkpoints are saved - one after each epoch ('last.ckpt') and one with the highest validation dice (e.g 'epoch=5.ckpt' for if highest dice was at 5th epoch).
|
|
|
+* `/results/logs.json`: Dice scores and loss values collected on the validation set after each training epoch.
|
|
|
+* `/results/checkpoints`: Saved checkpoints. By default, two checkpoints are saved - one after each epoch ('last.ckpt') and one with the highest validation dice (e.g., 'epoch=5.ckpt' if the highest dice was at the 5th epoch).
|
|
|
|
|
|
To load the pretrained model provide `--ckpt_path <path/to/checkpoint>`.
|
|
|
|
|
|
@@ -451,7 +450,7 @@ The script will then:
|
|
|
* Run inference on the preprocessed validation dataset corresponding to fold 0
|
|
|
* Print achieved score to the console
|
|
|
* If `--save_preds` is provided then resulting masks in the NumPy format will be saved in the `/results` directory
|
|
|
-
|
|
|
+
|
|
|
## Performance
|
|
|
|
|
|
### Benchmarking
|
|
|
@@ -460,13 +459,13 @@ The following section shows how to run benchmarks to measure the model performan
|
|
|
|
|
|
#### Training performance benchmark
|
|
|
|
|
|
-To benchmark training, run `scripts/benchmark.py` script with `--mode train`:
|
|
|
+To benchmark training, run the `scripts/benchmark.py` script with `--mode train`:
|
|
|
|
|
|
```
|
|
|
-python scripts/benchmark.py --mode train --gpus <ngpus> --dim {2,3} --batch_size <bsize> [--amp]
|
|
|
+python scripts/benchmark.py --mode train --gpus <ngpus> --dim {2,3} --batch_size <bsize> [--amp] [--bind]
|
|
|
```
|
|
|
|
|
|
-For example, to benchmark 3D U-Net training using mixed-precision on 8 GPUs with batch size of 2, run:
|
|
|
+For example, to benchmark 3D U-Net training using mixed-precision on 8 GPUs with a batch size of 2, run:
|
|
|
|
|
|
```
|
|
|
python scripts/benchmark.py --mode train --gpus 8 --dim 3 --batch_size 2 --amp
|
|
|
@@ -478,22 +477,25 @@ At the end of the script, a line reporting the best train throughput and latency
|
|
|
|
|
|
#### Inference performance benchmark
|
|
|
|
|
|
-To benchmark inference, run `scripts/benchmark.py` script with `--mode predict`:
|
|
|
+To benchmark inference, run the `scripts/benchmark.py` script with `--mode predict`:
|
|
|
|
|
|
```
|
|
|
python scripts/benchmark.py --mode predict --dim {2,3} --batch_size <bsize> [--amp]
|
|
|
```
|
|
|
|
|
|
-For example, to benchmark inference using mixed-precision for 3D U-Net, with batch size of 4, run:
|
|
|
+For example, to benchmark inference using mixed-precision for 3D U-Net, with a batch size of 4, run:
|
|
|
|
|
|
```
|
|
|
python scripts/benchmark.py --mode predict --dim 3 --amp --batch_size 4
|
|
|
```
|
|
|
|
|
|
-Each of these scripts will by default run warm-up for 1 data pass and start inference benchmarking during the second pass.
|
|
|
+Each of these scripts will by default run a warm-up for 1 data pass and start inference benchmarking during the second pass.
|
|
|
|
|
|
At the end of the script, a line reporting the inference throughput and latency will be printed.
|
|
|
|
|
|
+*Note that this benchmark reports performance numbers for iterations over samples with fixed patch sizes.
|
|
|
+The real inference process uses a sliding window over input images with arbitrary resolution, and performance may vary for images with different resolutions.*
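For reference, the full-resolution path can be sketched with MONAI's sliding-window helper (an illustration with assumed parameters, not the repository's exact inference code):

```
import torch
from monai.inferers import sliding_window_inference

# Tile an arbitrary-resolution volume into fixed-size patches, run the
# network on each patch, and blend the overlapping predictions.
with torch.no_grad():
    pred = sliding_window_inference(
        inputs=volume,               # placeholder NCDHW tensor
        roi_size=(128, 128, 128),    # the training patch size
        sw_batch_size=4,
        predictor=model,             # placeholder network
        overlap=0.5,
    )
```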
|
|
|
+
|
|
|
### Results
|
|
|
|
|
|
The following sections provide details on how to achieve the same performance and accuracy in training and inference.
|
|
|
@@ -502,63 +504,77 @@ The following sections provide details on how to achieve the same performance an
|
|
|
|
|
|
##### Training accuracy: NVIDIA DGX A100 (8x A100 80G)
|
|
|
|
|
|
-Our results were obtained by running the `python scripts/train.py --gpus {1,8} --fold {0,1,2,3,4} --dim {2,3} [--amp]` training scripts and averaging results in the PyTorch 21.11 NGC container on NVIDIA DGX with (8x A100 80G) GPUs.
|
|
|
+Our results were obtained by running the `python scripts/train.py --gpus {1,8} --fold {0,1,2,3,4} --dim {2,3} [--amp] [--bind] --learning_rate lr --seed n` training script and averaging results in the PyTorch 22.11 NGC container on NVIDIA DGX A100 (8x A100 80G) GPUs.
|
|
|
+
|
|
|
+Note: We recommend using the `--bind` flag for multi-GPU settings to increase throughput. To launch multi-GPU runs with `--bind`, use the PyTorch distributed launcher, e.g., `python -m torch.distributed.launch --use_env --nproc_per_node=8 scripts/benchmark.py --mode train --gpus 8 --dim 3 --amp --batch_size 2 --bind` for an interactive session, or use the regular command when launching with SLURM's sbatch.
|
|
|
|
|
|
-| Dimension | GPUs | Batch size / GPU | Accuracy - mixed precision | Accuracy - TF32 | Time to train - mixed precision | Time to train - TF32| Time to train speedup (TF32 to mixed precision)
|
|
|
+| Dimension | GPUs | Batch size / GPU | Dice - mixed precision | Dice - TF32 | Time to train - mixed precision | Time to train - TF32| Time to train speedup (TF32 to mixed precision)
|
|
|
|:-:|:-:|:--:|:-----:|:-----:|:-----:|:-----:|:----:|
|
|
|
| 2 | 1 | 2 | 73.21 | 73.11 | 33 min| 48 min| 1.46 |
|
|
|
| 2 | 8 | 2 | 73.15 | 73.16 | 9 min| 13 min| 1.44 |
|
|
|
| 3 | 1 | 2 | 74.35 | 74.34 |104 min|167 min| 1.61 |
|
|
|
| 3 | 8 | 2 | 74.30 | 74.32 | 23 min| 36 min| 1.57 |
|
|
|
|
|
|
-##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
|
|
|
+The reported dice score is the average over 5 folds from the best run of a grid search over learning rates {1e-4, 2e-4, ..., 9e-4} and seeds {1, 3, 5}.
|
|
|
|
|
|
-Our results were obtained by running the `python scripts/train.py --gpus {1,8} --fold {0,1,2,3,4} --dim {2,3} [--amp]` training scripts and averaging results in the PyTorch 21.11 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
|
|
|
+##### Training accuracy: NVIDIA DGX-1 (8x V100 32G)
|
|
|
|
|
|
-| Dimension | GPUs | Batch size / GPU | Accuracy - mixed precision | Accuracy - FP32 | Time to train - mixed precision | Time to train - FP32 | Time to train speedup (FP32 to mixed precision)
|
|
|
+Our results were obtained by running the `python scripts/train.py --gpus {1,8} --fold {0,1,2,3,4} --dim {2,3} [--amp] [--bind] --seed n` training script and averaging results in the PyTorch 22.11 NGC container on NVIDIA DGX-1 (8x V100 32G) GPUs.
|
|
|
+
|
|
|
+Note: We recommend using the `--bind` flag for multi-GPU settings to increase throughput. To launch multi-GPU runs with `--bind`, use the PyTorch distributed launcher, e.g., `python -m torch.distributed.launch --use_env --nproc_per_node=8 scripts/benchmark.py --mode train --gpus 8 --dim 3 --amp --batch_size 2 --bind` for an interactive session, or use the regular command when launching with SLURM's sbatch.
|
|
|
+
|
|
|
+| Dimension | GPUs | Batch size / GPU | Dice - mixed precision | Dice - FP32 | Time to train - mixed precision | Time to train - FP32 | Time to train speedup (FP32 to mixed precision)
|
|
|
|:-:|:-:|:--:|:-----:|:-----:|:-----:|:-----:|:----:|
|
|
|
| 2 | 1 | 2 | 73.18 | 73.22 | 60 min|114 min| 1.90 |
|
|
|
| 2 | 8 | 2 | 73.15 | 73.18 | 13 min| 19 min| 1.46 |
|
|
|
| 3 | 1 | 2 | 74.31 | 74.33 |201 min|680 min| 3.38 |
|
|
|
| 3 | 8 | 2 | 74.35 | 74.39 | 41 min|153 min| 3.73 |
|
|
|
|
|
|
+The reported dice score is the average over 5 folds from the best run of a grid search over learning rates {1e-4, 2e-4, ..., 9e-4} and seeds {1, 3, 5}.
|
|
|
+
|
|
|
#### Training performance results
|
|
|
|
|
|
##### Training performance: NVIDIA DGX A100 (8x A100 80G)
|
|
|
|
|
|
Our results were obtained by running the `python scripts/benchmark.py --mode train --gpus {1,8} --dim {2,3} --batch_size <bsize> [--amp]` training script in the NGC container on NVIDIA DGX A100 (8x A100 80G) GPUs. Performance numbers (in volumes per second) were averaged over an entire training epoch.
|
|
|
|
|
|
+Note: We recommend using the `--bind` flag for multi-GPU settings to increase throughput. To launch multi-GPU runs with `--bind`, use `python -m torch.distributed.launch --use_env --nproc_per_node=<ngpus> scripts/train.py --bind ...` for an interactive session, or use the regular command when launching with SLURM's sbatch.
|
|
|
+
|
|
|
| Dimension | GPUs | Batch size / GPU | Throughput - mixed precision [img/s] | Throughput - TF32 [img/s] | Throughput speedup (TF32 - mixed precision) | Weak scaling - mixed precision | Weak scaling - TF32 |
|
|
|
|:-:|:-:|:--:|:------:|:------:|:-----:|:-----:|:-----:|
|
|
|
-| 2 | 1 | 64 | 1129.48 | 702.82 | 1.607 | N/A | N/A |
|
|
|
-| 2 | 1 | 128 | 1234.69 | 741.01 | 1.666 | N/A | N/A |
|
|
|
-| 2 | 8 | 64 | 7015.45 | 4613.27 | 1.521 | 6.211 | 6.564 |
|
|
|
-| 2 | 8 | 128 | 8293.61 | 5498.78 | 1.508 | 6.717 | 7.421 |
|
|
|
-| 3 | 1 | 1 | 13.92 | 9.22 | 1.509 | N/A | N/A |
|
|
|
-| 3 | 1 | 2 | 17.68 | 10.72 | 1.649 | N/A | N/A |
|
|
|
-| 3 | 1 | 4 | 20.56 | 11.5 | 1.787 | N/A | N/A |
|
|
|
-| 3 | 8 | 1 | 92.97 | 61.68 | 1.416 | 6.679 | 7.119 |
|
|
|
-| 3 | 8 | 2 | 114.47 | 72.23 | 1.475 | 6.475 | 7.242 |
|
|
|
-| 3 | 8 | 4 | 140.55 | 85.53 | 1.643 | 6.836 | 7.437 |
|
|
|
+| 2 | 1 | 32 | 1040.58 | 732.22 | 1.42 | - | - |
|
|
|
+| 2 | 1 | 64 | 1238.68 | 797.37 | 1.55 | - | - |
|
|
|
+| 2 | 1 | 128 | 1345.29 | 838.38 | 1.60 | - | - |
|
|
|
+| 2 | 8 | 32 | 7747.27 | 5588.2 | 1.39 | 7.45 | 7.60 |
|
|
|
+| 2 | 8 | 64 | 9417.27 | 6246.95 | 1.51 | 7.60 | 8.04 |
|
|
|
+| 2 | 8 | 128 | 10694.1 | 6631.08 | 1.61 | 7.95 | 7.83 |
|
|
|
+| 3 | 1 | 1 | 24.61 | 9.66 | 2.55 | - | - |
|
|
|
+| 3 | 1 | 2 | 27.48 | 11.27 | 2.44 | - | - |
|
|
|
+| 3 | 1 | 4 | 29.96 | 12.22 | 2.45 | - | - |
|
|
|
+| 3 | 8 | 1 | 187.07 | 76.44 | 2.45 | 7.63 | 7.91 |
|
|
|
+| 3 | 8 | 2 | 220.83 | 88.67 | 2.49 | 7.83 | 7.87 |
|
|
|
+| 3 | 8 | 4 | 234.5 | 96.61 | 2.43 | 7.91 | 7.91 |
|
|
|
|
|
|
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
|
|
|
|
|
|
-##### Training performance: NVIDIA DGX-1 (8x V100 16G)
|
|
|
+##### Training performance: NVIDIA DGX-1 (8x V100 32G)
|
|
|
+
|
|
|
+Our results were obtained by running the `python scripts/benchmark.py --mode train --gpus {1,8} --dim {2,3} --batch_size <bsize> [--amp] [--bind]` training script in the PyTorch 22.11 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in volumes per second) were averaged over an entire training epoch.
|
|
|
|
|
|
-Our results were obtained by running the `python scripts/benchmark.py --mode train --gpus {1,8} --dim {2,3} --batch_size <bsize> [--amp]` training script in the PyTorch 21.11 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in volumes per second) were averaged over an entire training epoch.
|
|
|
+Note: We recommend using the `--bind` flag for multi-GPU settings to increase throughput. To launch multi-GPU runs with `--bind`, use `python -m torch.distributed.launch --use_env --nproc_per_node=<ngpus> scripts/train.py --bind ...` for an interactive session, or use the regular command when launching with SLURM's sbatch.
|
|
|
|
|
|
| Dimension | GPUs | Batch size / GPU | Throughput - mixed precision [img/s] | Throughput - FP32 [img/s] | Throughput speedup (FP32 - mixed precision) | Weak scaling - mixed precision | Weak scaling - FP32 |
|
|
|
|:-:|:-:|:---:|:---------:|:-----------:|:--------:|:---------:|:-------------:|
|
|
|
-| 2 | 1 | 64 | 607.16 | 298.84 | 2.032 | N/A | N/A |
|
|
|
-| 2 | 1 | 128 | 653.44 | 307.01 | 2.128 | N/A | N/A |
|
|
|
-| 2 | 8 | 64 | 4058.79 | 2196.05 | 1.848 | 6.685 | 7.349 |
|
|
|
-| 2 | 8 | 128 | 4649.37 | 2388.46 | 1.848 | 7.115 | 7.779 |
|
|
|
-| 3 | 1 | 1 | 8.66 | 1.99 | 4.352 | N/A | N/A |
|
|
|
-| 3 | 1 | 2 | 9.65 | 2.07 | 4.662 | N/A | N/A |
|
|
|
-| 3 | 1 | 4 | 9.99 | OOM | N/A | N/A | N/A |
|
|
|
-| 3 | 8 | 1 | 58.45 | 15.55 | 3.756 | 6.749 | 7.819 |
|
|
|
-| 3 | 8 | 2 | 66.03 | 16.22 | 4.071 | 6.842 | 7.835 |
|
|
|
-| 3 | 8 | 4 | 67.37 | OOM | N/A | 6.743 | N/A |
|
|
|
+| 2 | 1 | 32 | 561.6 | 310.21 | 1.81 | - | - |
|
|
|
+| 2 | 1 | 64 | 657.91 | 326.02 | 2.02 | - | - |
|
|
|
+| 2 | 1 | 128 | 706.92 | 332.81 | 2.12 | - | - |
|
|
|
+| 2 | 8 | 32 | 3903.88 | 2396.88 | 1.63 | 6.95 | 7.73 |
|
|
|
+| 2 | 8 | 64 | 4922.76 | 2590.66 | 1.90 | 7.48 | 7.95 |
|
|
|
+| 2 | 8 | 128 | 5597.87 | 2667.56 | 2.10 | 7.92 | 8.02 |
|
|
|
+| 3 | 1 | 1 | 11.38 | 2.07 | 5.50 | - | - |
|
|
|
+| 3 | 1 | 2 | 12.34 | 2.51 | 4.92 | - | - |
|
|
|
+| 3 | 8 | 1 | 84.38 | 16.55 | 5.10 | 7.41 | 8.00 |
|
|
|
+| 3 | 8 | 2 | 98.17 | 20.15 | 4.87 | 7.96 | 8.03 |
|
|
|
|
|
|
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
|
|
|
|
|
|
@@ -566,57 +582,58 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
|
|
|
|
|
|
##### Inference performance: NVIDIA DGX A100 (1x A100 80G)
|
|
|
|
|
|
-Our results were obtained by running the `python scripts/benchmark.py --mode predict --dim {2,3} --batch_size <bsize> [--amp]` inferencing benchmarking script in the PyTorch 21.11 NGC container on NVIDIA DGX A100 (1x A100 80G) GPU.
|
|
|
+Our results were obtained by running the `python scripts/benchmark.py --mode predict --dim {2,3} --batch_size <bsize> [--amp]` inferencing benchmarking script in the PyTorch 22.11 NGC container on NVIDIA DGX A100 (1x A100 80G) GPU.
|
|
|
|
|
|
FP16
|
|
|
|
|
|
| Dimension | Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
|
|
|
|:----------:|:---------:|:-------------:|:----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
|
|
|
-| 2 | 64 | 4x192x160 | 3211.23 | 19.93 | 20.24 | 20.38 | 20.84 |
|
|
|
-| 2 | 128 | 4x192x160 | 3465.45 | 36.94 | 38.35 | 38.72 | 38.95 |
|
|
|
-| 3 | 1 | 4x128x128x128 | 41.93 | 23.85 | 24.40 | 24.61 | 24.99 |
|
|
|
-| 3 | 2 | 4x128x128x128 | 44.24 | 45.21 | 47.08 | 47.38 | 48.24 |
|
|
|
-| 3 | 4 | 4x128x128x128 | 45.81 | 87.31 | 88.13 | 88.56 | 89.69 |
|
|
|
+| 2 | 32 | 192x160 | 1818.05 | 17.6 | 19.86 | 20.38 | 20.98 |
|
|
|
+| 2 | 64 | 192x160 | 3645.16 | 17.56 | 19.86 | 20.82 | 23.66 |
|
|
|
+| 2 | 128 | 192x160 | 3850.35 | 33.24 | 34.72 | 61.4 | 63.58 |
|
|
|
+| 3 | 1 | 128x128x128 | 68.45 | 14.61 | 17.02 | 17.41 | 19.27 |
|
|
|
+| 3 | 2 | 128x128x128 | 56.9 | 35.15 | 40.9 | 43.15 | 57.94 |
|
|
|
+| 3 | 4 | 128x128x128 | 76.39 | 52.36 | 57.9 | 59.52 | 70.24 |
|
|
|
|
|
|
TF32
|
|
|
|
|
|
| Dimension | Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
|
|
|
|:----------:|:---------:|:-------------:|:----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
|
|
|
-| 2 | 64 | 4x192x160 | 2172.38 | 29.46 | 29.94 | 30.03 | 30.19 |
|
|
|
-| 2 | 128 | 4x192x160 | 1769.56 | 72.34 | 72.84 | 73.04 | 74.79 |
|
|
|
-| 3 | 1 | 4x128x128x128 | 23.83 | 41.97 | 42.71 | 42.76 | 42.87 |
|
|
|
-| 3 | 2 | 4x128x128x128 | 26.75 | 74.77 | 75.79 | 76.06 | 77.04 |
|
|
|
-| 3 | 4 | 4x128x128x128 | 27.10 | 147.62 | 147.81 | 149.14 | 190.08 |
|
|
|
-
|
|
|
+| 2 | 32 | 192x160 | 1868.56 | 17.13 | 51.75 | 53.07 | 54.92 |
|
|
|
+| 2 | 64 | 192x160 | 2508.57 | 25.51 | 56.83 | 90.08 | 96.87 |
|
|
|
+| 2 | 128 | 192x160 | 2609.6 | 49.05 | 191.48 | 201.8 | 205.29 |
|
|
|
+| 3 | 1 | 128x128x128 | 35.02 | 28.55 | 51.75 | 53.07 | 54.92 |
|
|
|
+| 3 | 2 | 128x128x128 | 39.88 | 50.15 | 56.83 | 90.08 | 96.87 |
|
|
|
+| 3 | 4 | 128x128x128 | 41.32 | 96.8 | 191.48 | 201.8 | 205.29 |
|
|
|
|
|
|
Throughput is reported in images per second. Latency is reported in milliseconds per batch.
|
|
|
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
|
|
|
|
|
|
+##### Inference performance: NVIDIA DGX-1 (1x V100 32G)
|
|
|
|
|
|
-##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
|
|
|
-
|
|
|
-Our results were obtained by running the `python scripts/benchmark.py --mode predict --dim {2,3} --batch_size <bsize> [--amp]` inferencing benchmarking script in the PyTorch 21.11 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPU.
|
|
|
+Our results were obtained by running the `python scripts/benchmark.py --mode predict --dim {2,3} --batch_size <bsize> [--amp]` inferencing benchmarking script in the PyTorch 22.11 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPU.
|
|
|
|
|
|
FP16
|
|
|
|
|
|
| Dimension | Batch size |Resolution| Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
|
|
|
|:----------:|:---------:|:-------------:|:----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
|
|
|
-| 2 | 64 | 4x192x160 | 1809.79 | 35.36 | 35.75 | 35.84 | 36.21 |
|
|
|
-| 2 | 128 | 4x192x160 | 1987.91 | 64.39 | 64.79 | 64.87 | 65.01 |
|
|
|
-| 3 | 1 | 4x128x128x128 | 26.75 | 37.38 | 37.66 | 37.74 | 38.17 |
|
|
|
-| 3 | 2 | 4x128x128x128 | 23.28 | 85.91 | 86.77 | 87.39 | 89.54 |
|
|
|
-| 3 | 4 | 4x128x128x128 | 23.83 | 167.83 | 169.41 | 170.30 | 173.47 |
|
|
|
+| 2 | 32 | 192x160 | 1254.38 | 25.51 | 29.07 | 30.07 | 31.23 |
|
|
|
+| 2 | 64 | 192x160 | 2024.13 | 31.62 | 71.51 | 71.78 | 72.44 |
|
|
|
+| 2 | 128 | 192x160 | 2136.95 | 59.9 | 61.23 | 61.63 | 110.13 |
|
|
|
+| 3 | 1 | 128x128x128 | 36.93 | 27.08 | 28.6 | 31.43 | 48.3 |
|
|
|
+| 3 | 2 | 128x128x128 | 38.86 | 51.47 | 53.3 | 54.77 | 92.49 |
|
|
|
+| 3 | 4 | 128x128x128 | 39.15 | 102.18 | 104.62 | 112.17 | 180.47 |
|
|
|
|
|
|
FP32
|
|
|
|
|
|
| Dimension | Batch size |Resolution| Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
|
|
|
|:----------:|:---------:|:-------------:|:----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
|
|
|
-| 2 | 64 | 4x192x160 | 1007.91 | 63.50 | 63.93 | 64.03 | 64.19 |
|
|
|
-| 2 | 128 | 4x192x160 | 812.08 | 157.62 | 159.02 | 159.72 | 161.24 |
|
|
|
-| 3 | 1 | 4x128x128x128 | 8.23 | 121.45 | 122.84 | 123.93 | 124.69 |
|
|
|
-| 3 | 2 | 4x128x128x128 | 8.42 | 237.65 | 239.90 | 240.60 | 242.85 |
|
|
|
-| 3 | 4 | 4x128x128x128 | 8.37 | 478.01 | 482.70 | 483.43 | 484.84 |
|
|
|
-
|
|
|
+| 2 | 32 | 192x160 | 1019.97 | 31.37 | 32.93 | 55.58 | 69.14 |
|
|
|
+| 2 | 64 | 192x160 | 1063.59 | 60.17 | 62.32 | 63.11 | 111.01 |
|
|
|
+| 2 | 128 | 192x160 | 1069.81 | 119.65 | 123.48 | 123.83 | 225.46 |
|
|
|
+| 3 | 1 | 128x128x128 | 9.92 | 100.78 | 103.2 | 103.62 | 111.97 |
|
|
|
+| 3 | 2 | 128x128x128 | 10.14 | 197.33 | 201.05 | 201.4 | 201.79 |
|
|
|
+| 3 | 4 | 128x128x128 | 10.25 | 390.33 | 398.21 | 399.34 | 401.05 |
|
|
|
|
|
|
Throughput is reported in images per second. Latency is reported in milliseconds per batch.
|
|
|
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
|
|
|
@@ -625,19 +642,27 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
|
|
|
|
|
|
### Changelog
|
|
|
|
|
|
+November 2022
|
|
|
+- Container updated to 22.11
|
|
|
+- Add support for 3D channels-last convolutions
|
|
|
+- Add support for nvFuser Instance Normalization
|
|
|
+- Add support for GPU binding
|
|
|
+
|
|
|
+October 2022
|
|
|
+- Add Jupyter Notebook with BraTS'22 solution (ranked 2)
|
|
|
+
|
|
|
December 2021
|
|
|
- Container updated to 21.11
|
|
|
- Use MONAI DynUNet instead of custom U-Net implementation
|
|
|
-- Add balanced multi-gpu evaluation
|
|
|
+- Add balanced multi-GPU evaluation
|
|
|
- Support for evaluation with resampled volumes to original shape
|
|
|
|
|
|
October 2021
|
|
|
-- Add Jupyter Notebook with BraTS solution
|
|
|
+- Add Jupyter Notebook with BraTS'21 solution (ranked 3)
|
|
|
|
|
|
May 2021
|
|
|
-
|
|
|
- Add Triton Inference Server support
|
|
|
-- Removed deep supervision, attention and drop block
|
|
|
+- Removed deep supervision, attention, and drop block
|
|
|
|
|
|
March 2021
|
|
|
- Container updated to 21.02
|