kkudrynski 5 лет назад
Родитель
Commit
94a8f28554
20 измененных файлов с 139 добавлено и 114 удалено
  1. 44 38
      TensorFlow2/Segmentation/UNet_Medical/README.md
  2. 0 0
      TensorFlow2/Segmentation/UNet_Medical/data_loading/data_loader.py
  3. 1 1
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_1GPU.sh
  4. 1 1
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_8GPU.sh
  5. 1 1
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_INFER.sh
  6. 1 1
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_INFER_BENCHMARK.sh
  7. 1 1
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_INFER_BENCHMARK_TF-AMP.sh
  8. 1 1
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_INFER_TF-AMP.sh
  9. 1 1
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_TF-AMP_1GPU.sh
  10. 1 1
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_TF-AMP_8GPU.sh
  11. 5 5
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_TRAIN_1GPU.sh
  12. 5 5
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_TRAIN_8GPU.sh
  13. 5 5
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_TRAIN_TF-AMP_1GPU.sh
  14. 5 5
      TensorFlow2/Segmentation/UNet_Medical/examples/unet_TRAIN_TF-AMP_8GPU.sh
  15. 5 5
      TensorFlow2/Segmentation/UNet_Medical/main.py
  16. 8 2
      TensorFlow2/Segmentation/UNet_Medical/runtime/arguments.py
  17. 0 0
      TensorFlow2/Segmentation/UNet_Medical/runtime/losses.py
  18. 13 13
      TensorFlow2/Segmentation/UNet_Medical/runtime/parse_results.py
  19. 34 25
      TensorFlow2/Segmentation/UNet_Medical/runtime/run.py
  20. 7 3
      TensorFlow2/Segmentation/UNet_Medical/runtime/setup.py

+ 44 - 38
TensorFlow2/Segmentation/UNet_Medical/README.md

@@ -231,20 +231,20 @@ For the specifics concerning training and inference, see the [Advanced](#advance
   
    This script will launch a training on a single fold and store the model’s checkpoint in the <path/to/checkpoint> directory. 
   
-   The script can be run directly by modifying flags if necessary, especially the number of GPUs, which is defined after the `-np` flag. Since the test volume does not have labels, 20% of the training data is used for validation in 5-fold cross-validation manner. The number of fold can be changed using `--crossvalidation_idx` with an integer in range 0-4. For example, to run with 4 GPUs using fold 1 use:
+   The script can be run directly by modifying flags if necessary, especially the number of GPUs, which is defined after the `-np` flag. Since the test volume does not have labels, 20% of the training data is used for validation in a 5-fold cross-validation manner. The fold number can be changed using `--fold` with an integer in range 0-4. For example, to run with 4 GPUs using fold 1 use:
   
    ```bash
-   horovodrun -np 4 python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode train --crossvalidation_idx 1 --xla --amp
+   horovodrun -np 4 python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode train --fold 1 --xla --amp
    ```
   
    Training will result in a checkpoint file being written to `./results` on the host machine.
  
 6. Start validation/evaluation.
   
-   The trained model can be evaluated by passing the `--exec_mode evaluate` flag. Since evaluation is carried out on a validation dataset, the `--crossvalidation_idx` parameter should be filled. For example:
+   The trained model can be evaluated by passing the `--exec_mode evaluate` flag. Since evaluation is carried out on a validation dataset, the `--fold` parameter should be set. For example:
   
    ```bash
-   python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode evaluate --crossvalidation_idx 0 --xla --amp
+   python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode evaluate --fold 0 --xla --amp
    ```
   
    Evaluation can also be triggered jointly after training by passing the `--exec_mode train_and_evaluate` flag.
@@ -291,19 +291,20 @@ Other folders included in the root directory are:
 The complete list of the available parameters for the `main.py` script contains:
 * `--exec_mode`: Select the execution mode to run the model (default: `train`). Modes available:
   * `train` - trains model from scratch.
-  * `evaluate` - loads checkpoint (if available) and performs evaluation on validation subset (requires `--crossvalidation_idx` other than `None`).
-  * `train_and_evaluate` - trains model from scratch and performs validation at the end (requires `--crossvalidation_idx` other than `None`).
+  * `evaluate` - loads checkpoint (if available) and performs evaluation on validation subset (requires `--fold` other than `None`).
+  * `train_and_evaluate` - trains model from scratch and performs validation at the end (requires `--fold` other than `None`).
   * `predict` - loads checkpoint (if available) and runs inference on the test set. Stores the results in `--model_dir` directory.
   * `train_and_predict` - trains model from scratch and performs inference.
 * `--model_dir`: Set the output directory for information related to the model (default: `/results`).
 * `--log_dir`: Set the output directory for logs (default: None).
 * `--data_dir`: Set the input directory containing the dataset (default: `None`).
 * `--batch_size`: Size of each minibatch per GPU (default: `1`).
-* `--crossvalidation_idx`: Selected fold for cross-validation (default: `None`).
+* `--fold`: Selected fold for cross-validation (default: `None`).
 * `--max_steps`: Maximum number of steps (batches) for training (default: `1000`).
 * `--seed`: Set random seed for reproducibility (default: `0`).
 * `--weight_decay`: Weight decay coefficient (default: `0.0005`).
 * `--log_every`: Log performance every n steps (default: `100`).
+* `--evaluate_every`: Evaluate every n steps (default: `0` - evaluate once at the end).
 * `--learning_rate`: Model’s learning rate (default: `0.0001`).
 * `--augment`: Enable data augmentation (default: `False`).
 * `--benchmark`: Enable performance benchmarking (default: `False`). If the flag is set, the script runs in a benchmark mode - each iteration is timed and the performance result (in images per second) is printed at the end. Works for both `train` and `predict` execution modes.
@@ -324,8 +325,8 @@ usage: main.py [-h]
               [--exec_mode {train,train_and_predict,predict,evaluate,train_and_evaluate}]
               [--model_dir MODEL_DIR] --data_dir DATA_DIR [--log_dir LOG_DIR]
               [--batch_size BATCH_SIZE] [--learning_rate LEARNING_RATE]
-              [--crossvalidation_idx CROSSVALIDATION_IDX]
-              [--max_steps MAX_STEPS] [--weight_decay WEIGHT_DECAY]
+              [--fold FOLD] [--max_steps MAX_STEPS]
+              [--evaluate_every EVALUATE_EVERY] [--weight_decay WEIGHT_DECAY]
               [--log_every LOG_EVERY] [--warmup_steps WARMUP_STEPS]
               [--seed SEED] [--augment] [--benchmark]
               [--amp] [--xla]
@@ -333,34 +334,39 @@ usage: main.py [-h]
 UNet-medical
  
 optional arguments:
- -h, --help            show this help message and exit
- --exec_mode {train,train_and_predict,predict,evaluate,train_and_evaluate}
-                       Execution mode of running the model
- --model_dir MODEL_DIR
-                       Output directory for information related to the model
- --data_dir DATA_DIR   Input directory containing the dataset for training
-                       the model
- --log_dir LOG_DIR     Output directory for training logs
- --batch_size BATCH_SIZE
-                       Size of each minibatch per GPU
- --learning_rate LEARNING_RATE
-                       Learning rate coefficient for AdamOptimizer
- --crossvalidation_idx CROSSVALIDATION_IDX
-                       Chosen fold for cross-validation. Use None to disable
-                       cross-validation
- --max_steps MAX_STEPS
-                       Maximum number of steps (batches) used for training
- --weight_decay WEIGHT_DECAY
-                       Weight decay coefficient
- --log_every LOG_EVERY
-                       Log performance every n steps
- --warmup_steps WARMUP_STEPS
-                       Number of warmup steps
- --seed SEED           Random seed
- --augment             Perform data augmentation during training
- --benchmark           Collect performance metrics during training
- --amp                 Train using TF-AMP
- --xla                 Train using XLA
+  -h, --help            show this help message and exit
+  --exec_mode {train,train_and_predict,predict,evaluate,train_and_evaluate}
+                        Execution mode of running the model
+  --model_dir MODEL_DIR
+                        Output directory for information related to the model
+  --data_dir DATA_DIR   Input directory containing the dataset for training
+                        the model
+  --log_dir LOG_DIR     Output directory for training logs
+  --batch_size BATCH_SIZE
+                        Size of each minibatch per GPU
+  --learning_rate LEARNING_RATE
+                        Learning rate coefficient for AdamOptimizer
+  --fold FOLD           Chosen fold for cross-validation. Use None to disable
+                        cross-validation
+  --max_steps MAX_STEPS
+                        Maximum number of steps (batches) used for training
+  --weight_decay WEIGHT_DECAY
+                        Weight decay coefficient
+  --log_every LOG_EVERY
+                        Log performance every n steps
+  --evaluate_every EVALUATE_EVERY
+                        Evaluate every n steps
+  --warmup_steps WARMUP_STEPS
+                        Number of warmup steps
+  --seed SEED           Random seed
+  --augment             Perform data augmentation during training
+  --no-augment
+  --benchmark           Collect performance metrics during training
+  --no-benchmark
+  --use_amp, --amp      Train using TF-AMP
+  --use_xla, --xla      Train using XLA
+  --use_trt             Use TF-TRT
+  --resume_training     Resume training from a checkpoint
 ```
  
  
@@ -420,7 +426,7 @@ horovodrun -np <number/of/gpus> python main.py --data_dir /data [other parameter
 The main result of the training are checkpoints stored by default in `./results/` on the host machine, and in the `/results` in the container. This location can be controlled
 by the `--model_dir` command-line argument, if a different location was mounted while starting the container. In the case when the training is run in `train_and_predict` mode, the inference will take place after the training is finished, and inference results will be stored to the `/results` directory.
  
-If the `--exec_mode train_and_evaluate` parameter was used, and if `--crossvalidation_idx` parameter is set to an integer value of {0, 1, 2, 3, 4}, the evaluation of the validation set takes place after the training is completed. The results of the evaluation will be printed to the console.
+If the `--exec_mode train_and_evaluate` parameter was used, and if the `--fold` parameter is set to an integer value of {0, 1, 2, 3, 4}, the evaluation of the validation set takes place after the training is completed. The results of the evaluation will be printed to the console.
 
 ### Inference process
  

+ 0 - 0
TensorFlow2/Segmentation/UNet_Medical/utils/data_loader.py → TensorFlow2/Segmentation/UNet_Medical/data_loading/data_loader.py


+ 1 - 1
TensorFlow2/Segmentation/UNet_Medical/examples/unet_1GPU.sh

@@ -15,4 +15,4 @@
 # This script launches U-Net run in FP32 on 1 GPU and trains for 6400 iterations with batch_size 8. Usage:
 # bash unet_FP32_1GPU.sh <path to dataset> <path to results directory>
 
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --log_dir $2/log.json
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --fold 0 --augment --xla --log_dir $2/log.json

+ 1 - 1
TensorFlow2/Segmentation/UNet_Medical/examples/unet_8GPU.sh

@@ -15,4 +15,4 @@
 # This script launches U-Net run in FP32 on 8 GPUs and trains for 6400 iterations with batch_size 8. Usage:
 # bash unet_FP32_8GPU.sh <path to dataset> <path to results directory>
 
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --log_dir $2/log.json
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --fold 0 --augment --xla --log_dir $2/log.json

+ 1 - 1
TensorFlow2/Segmentation/UNet_Medical/examples/unet_INFER.sh

@@ -15,4 +15,4 @@
 # This script launches U-Net run in FP32 on 1 GPU for inference batch_size 1. Usage:
 # bash unet_INFER_FP32.sh <path to this repository> <path to dataset> <path to results directory>
 
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size 1 --exec_mode predict --xla
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size 1 --exec_mode predict --xla --fold 0

+ 1 - 1
TensorFlow2/Segmentation/UNet_Medical/examples/unet_INFER_BENCHMARK.sh

@@ -15,4 +15,4 @@
 # This script launches U-Net run in FP32 on 1 GPU for inference benchmarking. Usage:
 # bash unet_INFER_BENCHMARK_FP32.sh <path to dataset> <path to results directory> <batch size>
 
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size $3 --exec_mode predict --benchmark --warmup_steps 200 --max_steps 600 --xla
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size $3 --exec_mode predict --benchmark --warmup_steps 200 --max_steps 600 --xla --fold 0

+ 1 - 1
TensorFlow2/Segmentation/UNet_Medical/examples/unet_INFER_BENCHMARK_TF-AMP.sh

@@ -15,4 +15,4 @@
 # This script launches U-Net run in FP16 on 1 GPU for inference benchmarking. Usage:
 # bash unet_INFER_BENCHMARK_TF-AMP.sh <path to dataset> <path to results directory> <batch size>
 
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size $3 --exec_mode predict --benchmark --warmup_steps 200 --max_steps 600 --xla --amp
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size $3 --exec_mode predict --benchmark --warmup_steps 200 --max_steps 600 --xla --amp --fold 0

+ 1 - 1
TensorFlow2/Segmentation/UNet_Medical/examples/unet_INFER_TF-AMP.sh

@@ -15,4 +15,4 @@
 # This script launches U-Net run in FP16 on 1 GPU for inference batch_size 1. Usage:
 # bash unet_INFER_TF-AMP.sh <path to dataset> <path to results directory>
 
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size 1 --exec_mode predict --xla --amp
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size 1 --exec_mode predict --xla --amp --fold 0

+ 1 - 1
TensorFlow2/Segmentation/UNet_Medical/examples/unet_TF-AMP_1GPU.sh

@@ -15,4 +15,4 @@
 # This script launches U-Net run in FP16 on 1 GPU and trains for 6400 iterations batch_size 8. Usage:
 # bash unet_TF-AMP_1GPU.sh <path to dataset> <path to results directory>
 
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --amp --log_dir $2/log.json
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --fold 0 --augment --xla --amp --log_dir $2/log.json

+ 1 - 1
TensorFlow2/Segmentation/UNet_Medical/examples/unet_TF-AMP_8GPU.sh

@@ -15,4 +15,4 @@
 # This script launches U-Net run in FP16 on 8 GPUs and trains for 6400 iterations batch_size 8. Usage:
 # bash unet_TF-AMP_8GPU.sh <path to dataset> <path to results directory>
 
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --amp --log_dir $2/log.json
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --fold 0 --augment --xla --amp --log_dir $2/log.json

+ 5 - 5
TensorFlow2/Segmentation/UNet_Medical/examples/unet_TRAIN_1GPU.sh

@@ -16,9 +16,9 @@
 # Usage:
 # bash unet_TRAIN_FP32_1GPU.sh <path to dataset> <path to results directory> <batch size>
 
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla > $2/log_FP32_1GPU_fold0.txt
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 1 --augment --xla > $2/log_FP32_1GPU_fold1.txt
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 2 --augment --xla > $2/log_FP32_1GPU_fold2.txt
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 3 --augment --xla > $2/log_FP32_1GPU_fold3.txt
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 4 --augment --xla > $2/log_FP32_1GPU_fold4.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 0 --augment --xla > $2/log_FP32_1GPU_fold0.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 1 --augment --xla > $2/log_FP32_1GPU_fold1.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla > $2/log_FP32_1GPU_fold2.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla > $2/log_FP32_1GPU_fold3.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla > $2/log_FP32_1GPU_fold4.txt
 python utils/parse_results.py --model_dir $2 --exec_mode convergence --env FP32_1GPU

+ 5 - 5
TensorFlow2/Segmentation/UNet_Medical/examples/unet_TRAIN_8GPU.sh

@@ -16,9 +16,9 @@
 # Usage:
 # bash unet_TRAIN_FP32_8GPU.sh <path to dataset> <path to results directory> <batch size>
 
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla > $2/log_FP32_8GPU_fold0.txt
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 1 --augment --xla > $2/log_FP32_8GPU_fold1.txt
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 2 --augment --xla > $2/log_FP32_8GPU_fold2.txt
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 3 --augment --xla > $2/log_FP32_8GPU_fold3.txt
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 4 --augment --xla > $2/log_FP32_8GPU_fold4.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 0 --augment --xla > $2/log_FP32_8GPU_fold0.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 1 --augment --xla > $2/log_FP32_8GPU_fold1.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla > $2/log_FP32_8GPU_fold2.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla > $2/log_FP32_8GPU_fold3.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla > $2/log_FP32_8GPU_fold4.txt
 python utils/parse_results.py --model_dir $2 --exec_mode convergence --env FP32_8GPU

+ 5 - 5
TensorFlow2/Segmentation/UNet_Medical/examples/unet_TRAIN_TF-AMP_1GPU.sh

@@ -16,9 +16,9 @@
 # Usage:
 # bash unet_TRAIN_TF-AMP_1GPU.sh <path to dataset> <path to results directory> <batch size>
 
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold0.txt
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 1 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold1.txt
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 2 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold2.txt
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 3 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold3.txt
-horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 4 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold4.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 0 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold0.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 1 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold1.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold2.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold3.txt
+horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold4.txt
 python utils/parse_results.py --model_dir $2 --exec_mode convergence --env TF-AMP_1GPU

+ 5 - 5
TensorFlow2/Segmentation/UNet_Medical/examples/unet_TRAIN_TF-AMP_8GPU.sh

@@ -16,9 +16,9 @@
 # Usage:
 # bash unet_TRAIN_TF-AMP_8GPU.sh <path to dataset> <path to results directory> <batch size>
 
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold0.txt
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 1 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold1.txt
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 2 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold2.txt
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 3 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold3.txt
-horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 4 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold4.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 0 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold0.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 1 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold1.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold2.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold3.txt
+horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold4.txt
 python utils/parse_results.py --model_dir $2 --exec_mode convergence --env TF-AMP_8GPU

+ 5 - 5
TensorFlow2/Segmentation/UNet_Medical/main.py

@@ -26,10 +26,10 @@ Example:
 import horovod.tensorflow as hvd
 
 from model.unet import Unet
-from run import train, evaluate, predict
-from utils.setup import get_logger, set_flags, prepare_model_dir
-from utils.cmd_util import PARSER, parse_args
-from utils.data_loader import Dataset
+from runtime.run import train, evaluate, predict
+from runtime.setup import get_logger, set_flags, prepare_model_dir
+from runtime.arguments import PARSER, parse_args
+from data_loading.data_loader import Dataset
 
 
 def main():
@@ -47,7 +47,7 @@ def main():
 
     dataset = Dataset(data_dir=params.data_dir,
                       batch_size=params.batch_size,
-                      fold=params.crossvalidation_idx,
+                      fold=params.fold,
                       augment=params.augment,
                       gpu_id=hvd.rank(),
                       num_gpus=hvd.size(),

+ 8 - 2
TensorFlow2/Segmentation/UNet_Medical/utils/cmd_util.py → TensorFlow2/Segmentation/UNet_Medical/runtime/arguments.py

@@ -49,7 +49,7 @@ PARSER.add_argument('--learning_rate',
                     default=0.0001,
                     help="""Learning rate coefficient for AdamOptimizer""")
 
-PARSER.add_argument('--crossvalidation_idx',
+PARSER.add_argument('--fold',
                     type=int,
                     default=None,
                     help="""Chosen fold for cross-validation. Use None to disable cross-validation""")
@@ -69,6 +69,11 @@ PARSER.add_argument('--log_every',
                     default=100,
                     help="""Log performance every n steps""")
 
+PARSER.add_argument('--evaluate_every',
+                    type=int,
+                    default=0,
+                    help="""Evaluate every n steps""")
+
 PARSER.add_argument('--warmup_steps',
                     type=int,
                     default=200,
@@ -110,10 +115,11 @@ def parse_args(flags):
         'log_dir': flags.log_dir,
         'batch_size': flags.batch_size,
         'learning_rate': flags.learning_rate,
-        'crossvalidation_idx': flags.crossvalidation_idx,
+        'fold': flags.fold,
         'max_steps': flags.max_steps,
         'weight_decay': flags.weight_decay,
         'log_every': flags.log_every,
+        'evaluate_every': flags.evaluate_every,
         'warmup_steps': flags.warmup_steps,
         'augment': flags.augment,
         'benchmark': flags.benchmark,

+ 0 - 0
TensorFlow2/Segmentation/UNet_Medical/utils/losses.py → TensorFlow2/Segmentation/UNet_Medical/runtime/losses.py


+ 13 - 13
TensorFlow2/Segmentation/UNet_Medical/utils/parse_results.py → TensorFlow2/Segmentation/UNet_Medical/runtime/parse_results.py

@@ -17,21 +17,21 @@ import numpy as np
 import argparse
 
 
-def process_performance_stats(timestamps, params):
-    warmup_steps = params['warmup_steps']
-    batch_size = params['batch_size']
-    timestamps_ms = 1000 * timestamps[warmup_steps:]
-    timestamps_ms = timestamps_ms[timestamps_ms > 0]
-    latency_ms = timestamps_ms.mean()
-    std = timestamps_ms.std()
-    n = np.sqrt(len(timestamps_ms))
+def process_performance_stats(timestamps, batch_size, mode):
+    """ Get confidence intervals
+
+    :param timestamps: Collection of timestamps
+    :param batch_size: Number of samples per batch
+    :param mode: Estimator's execution mode
+    :return: Stats
+    """
+    timestamps_ms = 1000 * timestamps
     throughput_imgps = (1000.0 * batch_size / timestamps_ms).mean()
+    stats = {f"throughput_{mode}": throughput_imgps,
+             f"latency_{mode}_mean": timestamps_ms.mean()}
+    for level in [90, 95, 99]:
+        stats.update({f"latency_{mode}_{level}": np.percentile(timestamps_ms, level)})
 
-    stats = [("Throughput Avg", str(throughput_imgps)),
-             ('Latency Avg:', str(latency_ms))]
-    for ci, lvl in zip(["90%:", "95%:", "99%:"],
-                       [1.645, 1.960, 2.576]):
-        stats.append(("Latency_"+ci, str(latency_ms + lvl * std / n)))
     return stats
 
 

+ 34 - 25
TensorFlow2/Segmentation/UNet_Medical/run.py → TensorFlow2/Segmentation/UNet_Medical/runtime/run.py

@@ -19,8 +19,8 @@ from PIL import Image
 import horovod.tensorflow as hvd
 import tensorflow as tf
 
-from utils.losses import partial_losses
-from utils.parse_results import process_performance_stats
+from runtime.losses import partial_losses
+from runtime.parse_results import process_performance_stats
 
 
 def train(params, model, dataset, logger):
@@ -35,7 +35,7 @@ def train(params, model, dataset, logger):
     ce_loss = tf.keras.metrics.Mean(name='ce_loss')
     f1_loss = tf.keras.metrics.Mean(name='dice_loss')
     checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
-    if params.resume_training:
+    if params.resume_training and params.model_dir:
         checkpoint.restore(tf.train.latest_checkpoint(params.model_dir))
 
     @tf.function
@@ -69,26 +69,30 @@ def train(params, model, dataset, logger):
     if params.benchmark:
         assert max_steps * hvd.size() > params.warmup_steps, \
             "max_steps value has to be greater than warmup_steps"
-        timestamps = np.zeros((hvd.size(), max_steps * hvd.size() + 1), dtype=np.float32)
+        timestamps = []
         for iteration, (images, labels) in enumerate(dataset.train_fn(drop_remainder=True)):
-            t0 = time()
             loss = train_step(images, labels, warmup_batch=iteration == 0).numpy()
-            timestamps[hvd.rank(), iteration] = time() - t0
+            if iteration > params.warmup_steps:
+                timestamps.append(time())
             if iteration >= max_steps * hvd.size():
                 break
-        timestamps = np.mean(timestamps, axis=0)
+
         if hvd.rank() == 0:
-            stats = process_performance_stats(timestamps, params)
-            logger.log(step=(),
-                       data={metric: value for (metric, value) in stats})
+            deltas = np.array([timestamps[i + 1] - timestamps[i] for i in range(len(timestamps) - 1)])
+            stats = process_performance_stats(deltas, hvd.size() * params.batch_size, mode="train")
+            logger.log(step=(), data=stats)
     else:
         for iteration, (images, labels) in enumerate(dataset.train_fn()):
             train_step(images, labels, warmup_batch=iteration == 0)
-            if (hvd.rank() == 0) and (iteration % params.log_every == 0):
-                logger.log(step=(iteration, max_steps),
-                           data={"train_ce_loss": float(ce_loss.result()),
-                                 "train_dice_loss": float(f1_loss.result()),
-                                 "train_total_loss": float(f1_loss.result() + ce_loss.result())})
+            if hvd.rank() == 0:
+                if iteration % params.log_every == 0:
+                    logger.log(step=(iteration, max_steps),
+                               data={"train_ce_loss": float(ce_loss.result()),
+                                     "train_dice_loss": float(f1_loss.result()),
+                                     "train_total_loss": float(f1_loss.result() + ce_loss.result())})
+
+                if (params.evaluate_every > 0) and (iteration % params.evaluate_every == 0):
+                    evaluate(params, model, dataset, logger, restore_checkpoint=False)
 
                 f1_loss.reset_states()
                 ce_loss.reset_states()
@@ -101,13 +105,15 @@ def train(params, model, dataset, logger):
     logger.flush()
 
 
-def evaluate(params, model, dataset, logger):
+def evaluate(params, model, dataset, logger, restore_checkpoint=True):
+    if params.fold is None:
+        print("No fold specified for evaluation. Please use --fold [int] to select a fold.")
     ce_loss = tf.keras.metrics.Mean(name='ce_loss')
     f1_loss = tf.keras.metrics.Mean(name='dice_loss')
     checkpoint = tf.train.Checkpoint(model=model)
-    checkpoint.restore(tf.train.latest_checkpoint(params.model_dir)).expect_partial()
+    if params.model_dir and restore_checkpoint:
+        checkpoint.restore(tf.train.latest_checkpoint(params.model_dir)).expect_partial()
 
-    @tf.function
     def validation_step(features, labels):
         output_map = model(features, training=False)
         crossentropy_loss, dice_loss = partial_losses(output_map, labels)
@@ -130,7 +136,8 @@ def evaluate(params, model, dataset, logger):
 
 def predict(params, model, dataset, logger):
     checkpoint = tf.train.Checkpoint(model=model)
-    checkpoint.restore(tf.train.latest_checkpoint(params.model_dir)).expect_partial()
+    if params.model_dir:
+        checkpoint.restore(tf.train.latest_checkpoint(params.model_dir)).expect_partial()
 
     @tf.function
     def prediction_step(features):
@@ -139,16 +146,16 @@ def predict(params, model, dataset, logger):
     if params.benchmark:
         assert params.max_steps > params.warmup_steps, \
             "max_steps value has to be greater than warmup_steps"
-        timestamps = np.zeros(params.max_steps + 1, dtype=np.float32)
+        timestamps = []
         for iteration, images in enumerate(dataset.test_fn(count=None, drop_remainder=True)):
-            t0 = time()
             prediction_step(images)
-            timestamps[iteration] = time() - t0
+            timestamps.append(time())
             if iteration >= params.max_steps:
                 break
-        stats = process_performance_stats(timestamps, params)
-        logger.log(step=(),
-                   data={metric: value for (metric, value) in stats})
+
+        deltas = np.array([timestamps[i + 1] - timestamps[i] for i in range(len(timestamps) - 1)])
+        stats = process_performance_stats(deltas, params.batch_size, mode="test")
+        logger.log(step=(), data=stats)
     else:
         predictions = np.concatenate([prediction_step(images).numpy()
                                       for images in dataset.test_fn(count=1)], axis=0)
@@ -163,4 +170,6 @@ def predict(params, model, dataset, logger):
                               compression="tiff_deflate",
                               save_all=True,
                               append_images=multipage_tif[1:])
+
+        print("Predictions saved at {}".format(output_dir))
     logger.flush()

+ 7 - 3
TensorFlow2/Segmentation/UNet_Medical/utils/setup.py → TensorFlow2/Segmentation/UNet_Medical/runtime/setup.py

@@ -13,11 +13,13 @@
 # limitations under the License.
 
 import os
+import multiprocessing
+
 import numpy as np
 import tensorflow as tf
+import horovod.tensorflow as hvd
 
 import dllogger as logger
-import horovod.tensorflow as hvd
 from dllogger import StdOutBackend, Verbosity, JSONStreamBackend
 
 
@@ -32,6 +34,7 @@ def set_flags(params):
     os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
     os.environ['TF_SYNC_ON_FINISH'] = '0'
     os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
+    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '0'
 
     np.random.seed(params.seed)
     tf.random.set_seed(params.seed)
@@ -45,10 +48,11 @@ def set_flags(params):
     if gpus:
         tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
 
+    tf.config.threading.set_intra_op_parallelism_threads(1)
+    tf.config.threading.set_inter_op_parallelism_threads(max(2, (multiprocessing.cpu_count() // hvd.size()) - 2))
+
     if params.use_amp:
         tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
-    else:
-        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '0'
 
 
 def prepare_model_dir(params):