5 år sedan · b2e7f4a284
--- a/TensorFlow/Classification/ConvNets/main.py
+++ b/TensorFlow/Classification/ConvNets/main.py
@@ -99,7 +99,7 @@ if __name__ == "__main__":
 
				             symmetric=FLAGS.symmetric,
			
 
				             quant_delay = FLAGS.quant_delay,
			
 
				             use_qdq = FLAGS.use_qdq,
			
 
				-            finetune_checkpoint = FLAGS.finetune_checkpoint,
			
 
				+            finetune_checkpoint=FLAGS.finetune_checkpoint,
			
 
				         )
			
 
				 
			
 
				     if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
			
--- a/TensorFlow/Classification/ConvNets/model/resnet.py
+++ b/TensorFlow/Classification/ConvNets/model/resnet.py
@@ -188,6 +188,9 @@ class ResnetModel(object):
 
				                 use_final_conv=params['use_final_conv']
			
 
				             )
			
 
				             
			
 
				+            if mode!=tf.estimator.ModeKeys.PREDICT:
			
 
				+                logits = tf.squeeze(logits)
			
 
				+
			
 
				             if mode!=tf.estimator.ModeKeys.PREDICT:
			
 
				                 logits = tf.squeeze(logits)
			
 
				 
			
@@ -201,7 +204,7 @@ class ResnetModel(object):
 
				             tf.identity(logits, name="logits_ref")
			
 
				             tf.identity(probs, name="probs_ref")
			
 
				             tf.identity(y_preds, name="y_preds_ref")
			
 
				-            
			
 
				+
			
 
				             if mode == tf.estimator.ModeKeys.TRAIN and params['quantize']:
			
 
				                 dllogger.log(data={"QUANTIZATION AWARE TRAINING ENABLED": True}, step=tuple())
			
 
				                 if params['symmetric']:
			
@@ -219,7 +222,7 @@ class ResnetModel(object):
 
				                     train_var_dict[var.op.name] = var
			
 
				                 dllogger.log(data={"Restoring variables from checkpoint": params['finetune_checkpoint']}, step=tuple())
			
 
				                 tf.train.init_from_checkpoint(params['finetune_checkpoint'], train_var_dict)
			
 
				-                
			
 
				+
			
 
				         if mode == tf.estimator.ModeKeys.PREDICT:
			
 
				 
			
 
				             predictions = {'classes': y_preds, 'probabilities': probs}
			
@@ -458,7 +461,7 @@ class ResnetModel(object):
 
				 
			
 
				                 if logits.dtype != tf.float32:
			
 
				                     logits = tf.cast(logits, tf.float32)
			
 
				-                    
			
 
				+
			
 
				                 axis = 3 if self.model_hparams.compute_format=="NHWC" and use_final_conv else 1
			
 
				                 probs = layers.softmax(logits, name="softmax", axis=axis)
			
 
				 
			
--- a/TensorFlow/Classification/ConvNets/resnet50v1.5/README.md
+++ b/TensorFlow/Classification/ConvNets/resnet50v1.5/README.md
@@ -20,6 +20,8 @@ This repository provides a script and recipe to train the ResNet-50 v1.5 model t
 
				     * [Parameters](#parameters)
			
 
				         * [The `main.py` script](#the-mainpy-script)
			
 
				     * [Quantization Aware training](#quantization-aware-training)
			
 
				+        * [Post process checkpoint](#post-process-checkpoint)
			
 
				+        * [Exporting Frozen graphs](#exporting-frozen-graphs)
			
 
				     * [Inference process](#inference-process)
			
 
				 * [Performance](#performance)
			
 
				     * [Benchmarking](#benchmarking)
			
@@ -200,7 +202,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
 
				 2. Download and preprocess the dataset.
			
 
				 The ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
			
 
				 
			
 
				-To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
			
 
				+* [Download the images](http://image-net.org/download-images)
			
 
				+* Extract the training and validation data:
			
 
				+```bash
			
 
				+mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
			
 
				+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
			
 
				+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
			
 
				+cd ..
			
 
				+mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
			
 
				+```
			
 
				+* Preprocess the dataset to TFRecord form using the [build_imagenet_data.py script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
			
 
				 
			
 
				 3. Build the ResNet-50 v1.5 TensorFlow NGC container.
			
 
				 ```bash
			
@@ -400,7 +411,7 @@ operations for `tf.contrib.quantize.experimental_create_training_graph` has been
 
				      * `--output` : Name of the new checkpoint file which has the FC layer weights reshaped into 1x1 conv layer weights.
			
 
				      * `--dense_layer` : Name of the FC layer
			
 
				 
			
 
				-### Exporting Frozen graphs
			
 
				+#### Exporting Frozen graphs
			
 
				 To export frozen graphs (which can be used for inference with <a href="https://developer.nvidia.com/tensorrt">TensorRT</a>), use:
			
 
				 
			
 
				 `python export_frozen_graph.py --checkpoint <path_to_checkpoint> --quantize --use_final_conv --use_qdq --symmetric --input_format NCHW --compute_format NCHW --output_file=<output_file_name>`
			
@@ -452,7 +463,7 @@ To benchmark the training performance on a specific batch size, run:
 
				 Each of these scripts runs 200 warm-up iterations and measures the first epoch.
			
 
				 
			
 
				 To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
			
 
				-with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
			
 
				+with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
			
 
				 Suggested batch sizes for training are 256 for mixed precision training and 128 for single precision training per single V100 16 GB.
			
 
				 
			
 
				 #### Inference performance benchmark
			
@@ -468,8 +479,8 @@ To benchmark the inference performance on a specific batch size, run:
 
				 `python ./main.py --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
			
 
				 
			
 
				 By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
			
 
				-To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
			
 
				-If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
			
 
				+To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. 
			
 
				+If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
			
 
				 
			
 
				 The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnet50v1.5`, by simply running:
			
 
				 `bash ./resnet50v1.5/inference_benchmark.sh <data dir> <data idx dir>`
			
@@ -518,8 +529,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
 
				 
			
 
				 | GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA | Weak scaling - mixed precision + XLA |
			
 
				 |----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
			
 
				-| 1  | 256 | 808 img/s  | 1770 img/s    | 2.20x           | 1.00x     | 1.00x             |
			
 
				-| 8  | 256 | 6300 img/s | 16400 img/s   | 2.60x           | 7.79x     | 9.26x             |
			
 
				+| 1  | 256 | 909 img/s  | 2375 img/s    | 2.60x           | 1.00x     | 1.00x             |
			
 
				+| 8  | 256 | 7000 img/s | 17400 img/s   | 2.48x           | 7.70x     | 7.32x             |
			
 
				 
			
 
				 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
			
 
				 Our results were obtained by running the `resnet50v1.5/training/training_perf.sh` benchmark script in the 
			
--- a/TensorFlow/Classification/ConvNets/resnext101-32x4d/README.md
+++ b/TensorFlow/Classification/ConvNets/resnext101-32x4d/README.md
@@ -209,7 +209,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
 
				 2. Download and preprocess the dataset.
			
 
				 The ResNext101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
			
 
				 
			
 
				-To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
			
 
				+* [Download the images](http://image-net.org/download-images)
			
 
				+* Extract the training and validation data:
			
 
				+```bash
			
 
				+mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
			
 
				+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
			
 
				+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
			
 
				+cd ..
			
 
				+mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
			
 
				+```
			
 
				+* Preprocess the dataset to TFRecord form using the [build_imagenet_data.py script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
			
 
				 
			
 
				 3. Build the ResNext101-32x4d TensorFlow NGC container.
			
 
				 ```bash
			
@@ -420,7 +429,7 @@ To benchmark the training performance on a specific batch size, run:
 
				 Each of these scripts runs 200 warm-up iterations and measures the first epoch.
			
 
				 
			
 
				 To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
			
 
				-with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
			
 
				+with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
			
 
				 Suggested batch sizes for training are 128 for mixed precision training and 64 for single precision training per single V100 16 GB.
			
 
				 
			
 
				 
			
@@ -438,7 +447,7 @@ To benchmark the inference performance on a specific batch size, run:
 
				 
			
 
				 By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
			
 
				 To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
			
 
				-If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
			
 
				+If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
			
 
				 
			
 
				 The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnext101-32x4d`, by simply running:
			
 
				 `bash ./resnext101-32x4d/inference_benchmark.sh <data dir> <data idx dir>`
			
@@ -487,8 +496,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
 
				 
			
 
				 | GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA| Weak scaling - mixed precision + XLA |
			
 
				 |----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
			
 
				-| 1  | 128 (TF) / 256 (AMP) | 340 img/s  | 905 img/s    | 2.66x           | 1.00x     | 1.00x             |
			
 
				-| 8  | 128 (TF) / 256 (AMP) | 2630 img/s | 8000 img/s   | 3.05x           | 7.73x     | 8.84x             |
			
 
				+| 1  | 128 (TF) / 256 (AMP) | 371 img/s  | 1132 img/s    | 3.05x           | 1.00x     | 1.00x             |
			
 
				+| 8  | 128 (TF) / 256 (AMP) | 2854 img/s | 8500 img/s   | 2.98x           | 7.69x     | 7.51x             |
			
 
				 
			
 
				 
			
 
				 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
			
--- a/TensorFlow/Classification/ConvNets/runtime/runner.py
+++ b/TensorFlow/Classification/ConvNets/runtime/runner.py
@@ -95,7 +95,7 @@ class Runner(object):
 
				         #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
			
 
				 
			
 
				         os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
			
 
				-        os.environ['TF_GPU_THREAD_COUNT'] = '1' if not hvd_utils.is_using_hvd() else str(hvd.size())
			
 
				+        os.environ['TF_GPU_THREAD_COUNT'] = '2'
			
 
				 
			
 
				         os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
			
 
				 
			
@@ -246,11 +246,7 @@ class Runner(object):
 
				 
			
 
				         if mode == 'train':
			
 
				             config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
			
 
				-
			
 
				-            if hvd_utils.is_using_hvd():
			
 
				-                config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
			
 
				-            else:
			
 
				-                config.inter_op_parallelism_threads = 4
			
 
				+            config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // max(hvd.size(), 8) - 2))
			
 
				 
			
 
				         return config
			
 
				 
			
@@ -407,7 +403,7 @@ class Runner(object):
 
				 
			
 
				             if is_benchmark:
			
 
				                 self.training_logging_hook = hooks.BenchmarkLoggingHook(
			
 
				-                    global_batch_size=global_batch_size, warmup_steps=warmup_steps
			
 
				+                    global_batch_size=global_batch_size, warmup_steps=warmup_steps, logging_steps=log_every_n_steps
			
 
				                 )
			
 
				             else:
			
 
				                 self.training_logging_hook = hooks.TrainingLoggingHook(
			
@@ -415,7 +411,8 @@ class Runner(object):
 
				                     num_steps=num_steps,
			
 
				                     num_samples=num_samples,
			
 
				                     num_epochs=num_epochs,
			
 
				-                    steps_per_epoch=steps_per_epoch
			
 
				+                    steps_per_epoch=steps_per_epoch,
			
 
				+                    logging_steps=log_every_n_steps
			
 
				                 )
			
 
				             training_hooks.append(self.training_logging_hook)
			
 
				 
			
@@ -446,10 +443,10 @@ class Runner(object):
 
				             'symmetric': symmetric,
			
 
				             'quant_delay': quant_delay
			
 
				         }
			
 
				-        
			
 
				+
			
 
				         if finetune_checkpoint:
			
 
				-           estimator_params['finetune_checkpoint']=finetune_checkpoint
			
 
				-        
			
 
				+            estimator_params['finetune_checkpoint'] = finetune_checkpoint
			
 
				+
			
 
				         image_classifier = self._get_estimator(
			
 
				             mode='train',
			
 
				             run_params=estimator_params,
			
@@ -589,7 +586,9 @@ class Runner(object):
 
				         eval_hooks = []
			
 
				 
			
 
				         if hvd.rank() == 0:
			
 
				-            self.eval_logging_hook = hooks.BenchmarkLoggingHook(global_batch_size=batch_size, warmup_steps=warmup_steps)
			
 
				+            self.eval_logging_hook = hooks.BenchmarkLoggingHook(
			
 
				+                global_batch_size=batch_size, warmup_steps=warmup_steps, logging_steps=log_every_n_steps
			
 
				+            )
			
 
				             eval_hooks.append(self.eval_logging_hook)
			
 
				 
			
 
				             print('Starting Model Evaluation...')
			
--- a/TensorFlow/Classification/ConvNets/se-resnext101-32x4d/README.md
+++ b/TensorFlow/Classification/ConvNets/se-resnext101-32x4d/README.md
@@ -204,7 +204,16 @@ cd DeepLearningExamples/TensorFlow/Classification/ConvNets
 
				 2. Download and preprocess the dataset.
			
 
				 The SE-ResNext101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
			
 
				 
			
 
				-To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
			
 
				+* [Download the images](http://image-net.org/download-images)
			
 
				+* Extract the training and validation data:
			
 
				+```bash
			
 
				+mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
			
 
				+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
			
 
				+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
			
 
				+cd ..
			
 
				+mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
			
 
				+```
			
 
				+* Preprocess the dataset to TFRecord form using the [build_imagenet_data.py script](https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/build_imagenet_data.py). Additional metadata from the [authors' repository](https://github.com/tensorflow/models/tree/archive/research/inception/inception/data) might be required.
			
 
				 
			
 
				 3. Build the SE-ResNext101-32x4d TensorFlow NGC container.
			
 
				 ```bash
			
@@ -415,7 +424,7 @@ To benchmark the training performance on a specific batch size, run:
 
				 Each of these scripts runs 200 warm-up iterations and measures the first epoch.
			
 
				 
			
 
				 To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
			
 
				-with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
			
 
				+with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
			
 
				 Suggested batch sizes for training are 96 for mixed precision training and 64 for single precision training per single V100 16 GB.
			
 
				 
			
 
				 
			
@@ -433,7 +442,7 @@ To benchmark the inference performance on a specific batch size, run:
 
				 
			
 
				 By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
			
 
				 To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
			
 
				-If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. 
			
 
				+If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
			
 
				 
			
 
				 The benchmark can be automated with the `inference_benchmark.sh` script provided in `se-resnext101-32x4d`, by simply running:
			
 
				 `bash ./se-resnext101-32x4d/inference_benchmark.sh <data dir> <data idx dir>`
			
@@ -482,8 +491,8 @@ on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per secon
 
				 
			
 
				 | GPUs | Batch Size / GPU | Throughput - TF32 + XLA | Throughput - mixed precision + XLA | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 + XLA | Weak scaling - mixed precision + XLA |
			
 
				 |----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
			
 
				-| 1  | 128 (TF) / 256 (AMP) | 313 img/s  | 895 img/s    | 2.86x           | 1.00x     | 1.00x             |
			
 
				-| 8  | 128 (TF) / 256 (AMP) | 2400 img/s | 6930 img/s   | 2.88x           | 7.66x     | 7.74x             |
			
 
				+| 1  | 128 (TF) / 256 (AMP) | 342 img/s  | 975 img/s    | 2.86x           | 1.00x     | 1.00x             |
			
 
				+| 8  | 128 (TF) / 256 (AMP) | 2610 img/s | 7230 img/s   | 2.77x           | 7.63x     | 7.41x             |
			
 
				 
			
 
				 ##### Training performance: NVIDIA DGX-1 (8x V100 16G)
			
 
				 Our results were obtained by running the `se-resnext101-32x4d/training/training_perf.sh` benchmark script in the 
			
--- a/TensorFlow/Classification/ConvNets/utils/bind_dgx_a100.sh
+++ b/TensorFlow/Classification/ConvNets/utils/bind_dgx_a100.sh
@@ -0,0 +1,20 @@
 
				+#!/bin/bash
			
 
				+
			
 
				+if [[ -v SLURM_LOCALID ]]; then
			
 
				+    echo "Bind using slurm localid"
			
 
				+    LOCAL_ID=$SLURM_LOCALID
			
 
				+elif [[ -v OMPI_COMM_WORLD_LOCAL_RANK ]]; then
			
 
				+    echo "Bind using OpenMPI env"
			
 
				+    LOCAL_ID=$OMPI_COMM_WORLD_LOCAL_RANK
			
 
				+else
			
 
				+    echo "Bind to first node"
			
 
				+    LOCAL_ID=0
			
 
				+fi
			
 
				+
			
 
				+case $LOCAL_ID in
			
 
				+    0|1) exec numactl --cpunodebind=3 --membind=3 $@;;
			
 
				+    2|3) exec numactl --cpunodebind=1 --membind=1 $@;;
			
 
				+    4|5) exec numactl --cpunodebind=7 --membind=7 $@;;
			
 
				+    6|7) exec numactl --cpunodebind=5 --membind=5 $@;;
			
 
				+    *) echo "unknown binding"; exec $@;;
			
 
				+esac
			
--- a/TensorFlow/Classification/ConvNets/utils/cmdline_helper.py
+++ b/TensorFlow/Classification/ConvNets/utils/cmdline_helper.py
@@ -173,6 +173,51 @@ def parse_cmdline(available_arch):
 
				         help="Quantize weights and activations during training using symmetric quantization."
			
 
				     )
			
 
				 
			
 
				+    p.add_argument(
			
 
				+        '--finetune_checkpoint',
			
 
				+        required=False,
			
 
				+        default=None,
			
 
				+        type=str,
			
 
				+        help="Path to pre-trained checkpoint which will be used for fine-tuning"
			
 
				+    )
			
 
				+    
			
 
				+    _add_bool_argument(
			
 
				+        parser=p, name="use_final_conv", default=False, required=False, help="Use cosine learning rate schedule."
			
 
				+    )
			
 
				+
			
 
				+    p.add_argument(
			
 
				+        '--quant_delay',
			
 
				+        type=int,
			
 
				+        default=0,
			
 
				+        required=False,
			
 
				+        help="Number of steps to be run before quantization starts to happen"
			
 
				+    )
			
 
				+
			
 
				+    _add_bool_argument(
			
 
				+        parser=p,
			
 
				+        name="quantize",
			
 
				+        default=False,
			
 
				+        required=False,
			
 
				+        help="Quantize weights and activations during training. (Defaults to Assymmetric quantization)"
			
 
				+    )
			
 
				+
			
 
				+    _add_bool_argument(
			
 
				+        parser=p,
			
 
				+        name="use_qdq",
			
 
				+        default=False,
			
 
				+        required=False,
			
 
				+        help="Use QDQV3 op instead of FakeQuantWithMinMaxVars op for quantization. QDQv3 does only scaling"
			
 
				+    )
			
 
				+
			
 
				+    _add_bool_argument(
			
 
				+        parser=p,
			
 
				+        name="symmetric",
			
 
				+        default=False,
			
 
				+        required=False,
			
 
				+        help="Quantize weights and activations during training using symmetric quantization."
			
 
				+    )
			
 
				+
			
 
				+
			
 
				     p.add_argument(
			
 
				         '--log_filename',
			
 
				         type=str,
			
@@ -183,7 +228,7 @@ def parse_cmdline(available_arch):
 
				 
			
 
				     p.add_argument(
			
 
				         '--display_every',
			
 
				-        default=10,
			
 
				+        default=1,
			
 
				         type=int,
			
 
				         required=False,
			
 
				         help="""How often (in batches) to print out running information."""
			
--- a/TensorFlow/Classification/ConvNets/utils/hooks/benchmark_hooks.py
+++ b/TensorFlow/Classification/ConvNets/utils/hooks/benchmark_hooks.py
@@ -22,19 +22,19 @@ import dllogger
 
				 
			
 
				 from .training_hooks import MeanAccumulator
			
 
				 
			
 
				-
			
 
				 __all__ = ['BenchmarkLoggingHook']
			
 
				 
			
 
				 
			
 
				 class BenchmarkLoggingHook(tf.train.SessionRunHook):
			
 
				 
			
 
				-    def __init__(self, global_batch_size, warmup_steps=20):
			
 
				+    def __init__(self, global_batch_size, warmup_steps=20, logging_steps=1):
			
 
				         self.latencies = []
			
 
				         self.warmup_steps = warmup_steps
			
 
				         self.global_batch_size = global_batch_size
			
 
				         self.current_step = 0
			
 
				         self.t0 = None
			
 
				         self.mean_throughput = MeanAccumulator()
			
 
				+        self.logging_steps = logging_steps
			
 
				 
			
 
				     def before_run(self, run_context):
			
 
				         self.t0 = time.time()
			
@@ -46,7 +46,7 @@ class BenchmarkLoggingHook(tf.train.SessionRunHook):
 
				             self.latencies.append(batch_time)
			
 
				             self.mean_throughput.consume(ips)
			
 
				 
			
 
				-            dllogger.log(data={"total_ips" : ips},
			
 
				-                         step=(0, self.current_step))
			
 
				+            if (self.current_step % self.logging_steps) == 0:
			
 
				+                dllogger.log(data={"total_ips": ips}, step=(0, self.current_step))
			
 
				 
			
 
				         self.current_step += 1
			
--- a/TensorFlow/Classification/ConvNets/utils/hooks/training_hooks.py
+++ b/TensorFlow/Classification/ConvNets/utils/hooks/training_hooks.py
@@ -43,13 +43,16 @@ class MeanAccumulator:
 
				 
			
 
				 class TrainingLoggingHook(tf.train.SessionRunHook):
			
 
				 
			
 
				-    def __init__(self, global_batch_size, num_steps, num_samples, num_epochs, steps_per_epoch, warmup_steps=20):
			
 
				+    def __init__(
			
 
				+        self, global_batch_size, num_steps, num_samples, num_epochs, steps_per_epoch, warmup_steps=20, logging_steps=1
			
 
				+    ):
			
 
				         self.global_batch_size = global_batch_size
			
 
				         self.num_steps = num_steps
			
 
				         self.num_samples = num_samples
			
 
				         self.num_epochs = num_epochs
			
 
				         self.steps_per_epoch = steps_per_epoch
			
 
				         self.warmup_steps = warmup_steps
			
 
				+        self.logging_steps = logging_steps
			
 
				 
			
 
				         self.current_step = 0
			
 
				         self.current_epoch = 0
			
@@ -89,8 +92,9 @@ class TrainingLoggingHook(tf.train.SessionRunHook):
 
				         if self.current_step >= self.warmup_steps:
			
 
				             self.mean_throughput.consume(metrics['imgs_per_sec'])
			
 
				 
			
 
				-            metrics = {k: float(v) for k, v in metrics.items()}
			
 
				-            dllogger.log(data=metrics, step=(int(global_step // self.steps_per_epoch), int(global_step)))
			
 
				+            if (self.current_step % self.logging_steps) == 0:
			
 
				+                metrics = {k: float(v) for k, v in metrics.items()}
			
 
				+                dllogger.log(data=metrics, step=(int(global_step // self.steps_per_epoch), int(global_step)))
			
 
				 
			
 
				         self.current_step += 1