Przemek Strzelczyk 7 лет назад
Родитель
Commit
3d90b6dbf0

+ 2 - 2
TensorFlow/Detection/SSD/configs/ssd320_bench.config

@@ -172,7 +172,7 @@ train_config: {
 
 train_input_reader: {
   tf_record_input_reader {
-    input_path: "/data/coco_train.record*"
+    input_path: "/data/*train*"
   }
   label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
 }
@@ -185,7 +185,7 @@ eval_config: {
 
 eval_input_reader: {
   tf_record_input_reader {
-    input_path: "/data/coco_val.record*"
+    input_path: "/data/*val*"
   }
   label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
   shuffle: false

+ 0 - 193
TensorFlow/Detection/SSD/configs/ssd320_double_1gpus.config

@@ -1,193 +0,0 @@
-# SSD with Resnet 50 v1 FPN feature extractor, shared box predictor and focal
-# loss (a.k.a Retinanet).
-# See Lin et al, https://arxiv.org/abs/1708.02002
-# Trained on COCO, initialized from Imagenet classification checkpoint
-
-model {
-  ssd {
-    inplace_batchnorm_update: true
-    freeze_batchnorm: true
-    num_classes: 90
-    box_coder {
-      faster_rcnn_box_coder {
-        y_scale: 10.0
-        x_scale: 10.0
-        height_scale: 5.0
-        width_scale: 5.0
-      }
-    }
-    matcher {
-      argmax_matcher {
-        matched_threshold: 0.5
-        unmatched_threshold: 0.5
-        ignore_thresholds: false
-        negatives_lower_than_unmatched: true
-        force_match_for_each_row: true
-        use_matmul_gather: true
-      }
-    }
-    similarity_calculator {
-      iou_similarity {
-      }
-    }
-    encode_background_as_zeros: true
-    anchor_generator {
-      multiscale_anchor_generator {
-        min_level: 3
-        max_level: 7
-        anchor_scale: 4.0
-        aspect_ratios: [1.0, 2.0, 0.5]
-        scales_per_octave: 2
-      }
-    }
-    image_resizer {
-      fixed_shape_resizer {
-        height: 320
-        width: 320
-      }
-    }
-    box_predictor {
-      weight_shared_convolutional_box_predictor {
-        depth: 256
-        class_prediction_bias_init: -4.6
-        conv_hyperparams {
-          activation: RELU_6,
-          regularizer {
-            l2_regularizer {
-              weight: 0.0004
-            }
-          }
-          initializer {
-            random_normal_initializer {
-              stddev: 0.01
-              mean: 0.0
-            }
-          }
-          batch_norm {
-            scale: true,
-            decay: 0.997,
-            epsilon: 0.001,
-          }
-        }
-        num_layers_before_predictor: 4
-        kernel_size: 3
-      }
-    }
-    feature_extractor {
-      type: 'ssd_resnet50_v1_fpn'
-      fpn {
-        min_level: 3
-        max_level: 7
-      }
-      min_depth: 16
-      depth_multiplier: 1.0
-      conv_hyperparams {
-        activation: RELU_6,
-        regularizer {
-          l2_regularizer {
-            weight: 0.0004
-          }
-        }
-        initializer {
-          truncated_normal_initializer {
-            stddev: 0.03
-            mean: 0.0
-          }
-        }
-        batch_norm {
-          scale: true,
-          decay: 0.997,
-          epsilon: 0.001,
-        }
-      }
-      override_base_feature_extractor_hyperparams: true
-    }
-    loss {
-      classification_loss {
-        weighted_sigmoid_focal {
-          alpha: 0.25
-          gamma: 2.0
-        }
-      }
-      localization_loss {
-        weighted_smooth_l1 {
-        }
-      }
-      classification_weight: 1.0
-      localization_weight: 1.0
-    }
-    normalize_loss_by_num_matches: true
-    normalize_loc_loss_by_codesize: true
-    post_processing {
-      batch_non_max_suppression {
-        score_threshold: 1e-8
-        iou_threshold: 0.6
-        max_detections_per_class: 100
-        max_total_detections: 100
-      }
-      score_converter: SIGMOID
-    }
-  }
-}
-
-train_config: {
-  fine_tune_checkpoint: "/checkpoints/resnet_v1_50/model.ckpt"
-  fine_tune_checkpoint_type: "classification"
-  batch_size: 32 
-  sync_replicas: true
-  startup_delay_steps: 0
-  replicas_to_aggregate: 8
-  num_steps: 200000
-  data_augmentation_options {
-    random_horizontal_flip {
-    }
-  }
-  data_augmentation_options {
-    random_crop_image {
-      min_object_covered: 0.0
-      min_aspect_ratio: 0.75
-      max_aspect_ratio: 3.0
-      min_area: 0.75
-      max_area: 1.0
-      overlap_thresh: 0.0
-    }
-  }
-  optimizer {
-    momentum_optimizer: {
-      learning_rate: {
-        cosine_decay_learning_rate {
-          learning_rate_base: .02000000000000000000
-          total_steps: 200000
-          warmup_learning_rate: .00866640000000000000
-          warmup_steps: 8000
-        }
-      }
-      momentum_optimizer_value: 0.9
-    }
-    use_moving_average: false
-  }
-  max_number_of_boxes: 100
-  unpad_groundtruth_tensors: false
-}
-
-train_input_reader: {
-  tf_record_input_reader {
-    input_path: "/data/coco_train.record*"
-  }
-  label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
-}
-
-eval_config: {
-  metrics_set: "coco_detection_metrics"
-  use_moving_averages: false
-  num_examples: 8000
-}
-
-eval_input_reader: {
-  tf_record_input_reader {
-    input_path: "/data/coco_val.record*"
-  }
-  label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
-  shuffle: false
-  num_readers: 1
-}

+ 0 - 193
TensorFlow/Detection/SSD/configs/ssd320_double_4gpus.config

@@ -1,193 +0,0 @@
-# SSD with Resnet 50 v1 FPN feature extractor, shared box predictor and focal
-# loss (a.k.a Retinanet).
-# See Lin et al, https://arxiv.org/abs/1708.02002
-# Trained on COCO, initialized from Imagenet classification checkpoint
-
-model {
-  ssd {
-    inplace_batchnorm_update: true
-    freeze_batchnorm: true
-    num_classes: 90
-    box_coder {
-      faster_rcnn_box_coder {
-        y_scale: 10.0
-        x_scale: 10.0
-        height_scale: 5.0
-        width_scale: 5.0
-      }
-    }
-    matcher {
-      argmax_matcher {
-        matched_threshold: 0.5
-        unmatched_threshold: 0.5
-        ignore_thresholds: false
-        negatives_lower_than_unmatched: true
-        force_match_for_each_row: true
-        use_matmul_gather: true
-      }
-    }
-    similarity_calculator {
-      iou_similarity {
-      }
-    }
-    encode_background_as_zeros: true
-    anchor_generator {
-      multiscale_anchor_generator {
-        min_level: 3
-        max_level: 7
-        anchor_scale: 4.0
-        aspect_ratios: [1.0, 2.0, 0.5]
-        scales_per_octave: 2
-      }
-    }
-    image_resizer {
-      fixed_shape_resizer {
-        height: 320
-        width: 320
-      }
-    }
-    box_predictor {
-      weight_shared_convolutional_box_predictor {
-        depth: 256
-        class_prediction_bias_init: -4.6
-        conv_hyperparams {
-          activation: RELU_6,
-          regularizer {
-            l2_regularizer {
-              weight: 0.0004
-            }
-          }
-          initializer {
-            random_normal_initializer {
-              stddev: 0.01
-              mean: 0.0
-            }
-          }
-          batch_norm {
-            scale: true,
-            decay: 0.997,
-            epsilon: 0.001,
-          }
-        }
-        num_layers_before_predictor: 4
-        kernel_size: 3
-      }
-    }
-    feature_extractor {
-      type: 'ssd_resnet50_v1_fpn'
-      fpn {
-        min_level: 3
-        max_level: 7
-      }
-      min_depth: 16
-      depth_multiplier: 1.0
-      conv_hyperparams {
-        activation: RELU_6,
-        regularizer {
-          l2_regularizer {
-            weight: 0.0004
-          }
-        }
-        initializer {
-          truncated_normal_initializer {
-            stddev: 0.03
-            mean: 0.0
-          }
-        }
-        batch_norm {
-          scale: true,
-          decay: 0.997,
-          epsilon: 0.001,
-        }
-      }
-      override_base_feature_extractor_hyperparams: true
-    }
-    loss {
-      classification_loss {
-        weighted_sigmoid_focal {
-          alpha: 0.25
-          gamma: 2.0
-        }
-      }
-      localization_loss {
-        weighted_smooth_l1 {
-        }
-      }
-      classification_weight: 1.0
-      localization_weight: 1.0
-    }
-    normalize_loss_by_num_matches: true
-    normalize_loc_loss_by_codesize: true
-    post_processing {
-      batch_non_max_suppression {
-        score_threshold: 1e-8
-        iou_threshold: 0.6
-        max_detections_per_class: 100
-        max_total_detections: 100
-      }
-      score_converter: SIGMOID
-    }
-  }
-}
-
-train_config: {
-  fine_tune_checkpoint: "/checkpoints/resnet_v1_50/model.ckpt"
-  fine_tune_checkpoint_type: "classification"
-  batch_size: 32 
-  sync_replicas: true
-  startup_delay_steps: 0
-  replicas_to_aggregate: 8
-  num_steps: 50000
-  data_augmentation_options {
-    random_horizontal_flip {
-    }
-  }
-  data_augmentation_options {
-    random_crop_image {
-      min_object_covered: 0.0
-      min_aspect_ratio: 0.75
-      max_aspect_ratio: 3.0
-      min_area: 0.75
-      max_area: 1.0
-      overlap_thresh: 0.0
-    }
-  }
-  optimizer {
-    momentum_optimizer: {
-      learning_rate: {
-        cosine_decay_learning_rate {
-          learning_rate_base: .08000000000000000000
-          total_steps: 50000
-          warmup_learning_rate: .03466560000000000000
-          warmup_steps: 2000
-        }
-      }
-      momentum_optimizer_value: 0.9
-    }
-    use_moving_average: false
-  }
-  max_number_of_boxes: 100
-  unpad_groundtruth_tensors: false
-}
-
-train_input_reader: {
-  tf_record_input_reader {
-    input_path: "/data/coco_train.record*"
-  }
-  label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
-}
-
-eval_config: {
-  metrics_set: "coco_detection_metrics"
-  use_moving_averages: false
-  num_examples: 8000
-}
-
-eval_input_reader: {
-  tf_record_input_reader {
-    input_path: "/data/coco_val.record*"
-  }
-  label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
-  shuffle: false
-  num_readers: 1
-}

+ 0 - 193
TensorFlow/Detection/SSD/configs/ssd320_double_8gpus.config

@@ -1,193 +0,0 @@
-# SSD with Resnet 50 v1 FPN feature extractor, shared box predictor and focal
-# loss (a.k.a Retinanet).
-# See Lin et al, https://arxiv.org/abs/1708.02002
-# Trained on COCO, initialized from Imagenet classification checkpoint
-
-model {
-  ssd {
-    inplace_batchnorm_update: true
-    freeze_batchnorm: true
-    num_classes: 90
-    box_coder {
-      faster_rcnn_box_coder {
-        y_scale: 10.0
-        x_scale: 10.0
-        height_scale: 5.0
-        width_scale: 5.0
-      }
-    }
-    matcher {
-      argmax_matcher {
-        matched_threshold: 0.5
-        unmatched_threshold: 0.5
-        ignore_thresholds: false
-        negatives_lower_than_unmatched: true
-        force_match_for_each_row: true
-        use_matmul_gather: true
-      }
-    }
-    similarity_calculator {
-      iou_similarity {
-      }
-    }
-    encode_background_as_zeros: true
-    anchor_generator {
-      multiscale_anchor_generator {
-        min_level: 3
-        max_level: 7
-        anchor_scale: 4.0
-        aspect_ratios: [1.0, 2.0, 0.5]
-        scales_per_octave: 2
-      }
-    }
-    image_resizer {
-      fixed_shape_resizer {
-        height: 320
-        width: 320
-      }
-    }
-    box_predictor {
-      weight_shared_convolutional_box_predictor {
-        depth: 256
-        class_prediction_bias_init: -4.6
-        conv_hyperparams {
-          activation: RELU_6,
-          regularizer {
-            l2_regularizer {
-              weight: 0.0004
-            }
-          }
-          initializer {
-            random_normal_initializer {
-              stddev: 0.01
-              mean: 0.0
-            }
-          }
-          batch_norm {
-            scale: true,
-            decay: 0.997,
-            epsilon: 0.001,
-          }
-        }
-        num_layers_before_predictor: 4
-        kernel_size: 3
-      }
-    }
-    feature_extractor {
-      type: 'ssd_resnet50_v1_fpn'
-      fpn {
-        min_level: 3
-        max_level: 7
-      }
-      min_depth: 16
-      depth_multiplier: 1.0
-      conv_hyperparams {
-        activation: RELU_6,
-        regularizer {
-          l2_regularizer {
-            weight: 0.0004
-          }
-        }
-        initializer {
-          truncated_normal_initializer {
-            stddev: 0.03
-            mean: 0.0
-          }
-        }
-        batch_norm {
-          scale: true,
-          decay: 0.997,
-          epsilon: 0.001,
-        }
-      }
-      override_base_feature_extractor_hyperparams: true
-    }
-    loss {
-      classification_loss {
-        weighted_sigmoid_focal {
-          alpha: 0.25
-          gamma: 2.0
-        }
-      }
-      localization_loss {
-        weighted_smooth_l1 {
-        }
-      }
-      classification_weight: 1.0
-      localization_weight: 1.0
-    }
-    normalize_loss_by_num_matches: true
-    normalize_loc_loss_by_codesize: true
-    post_processing {
-      batch_non_max_suppression {
-        score_threshold: 1e-8
-        iou_threshold: 0.6
-        max_detections_per_class: 100
-        max_total_detections: 100
-      }
-      score_converter: SIGMOID
-    }
-  }
-}
-
-train_config: {
-  fine_tune_checkpoint: "/checkpoints/resnet_v1_50/model.ckpt"
-  fine_tune_checkpoint_type: "classification"
-  batch_size: 32 
-  sync_replicas: true
-  startup_delay_steps: 0
-  replicas_to_aggregate: 8
-  num_steps: 25000
-  data_augmentation_options {
-    random_horizontal_flip {
-    }
-  }
-  data_augmentation_options {
-    random_crop_image {
-      min_object_covered: 0.0
-      min_aspect_ratio: 0.75
-      max_aspect_ratio: 3.0
-      min_area: 0.75
-      max_area: 1.0
-      overlap_thresh: 0.0
-    }
-  }
-  optimizer {
-    momentum_optimizer: {
-      learning_rate: {
-        cosine_decay_learning_rate {
-          learning_rate_base: .16000000000000000000
-          total_steps: 25000
-          warmup_learning_rate: .06933120000000000000
-          warmup_steps: 1000
-        }
-      }
-      momentum_optimizer_value: 0.9
-    }
-    use_moving_average: false
-  }
-  max_number_of_boxes: 100
-  unpad_groundtruth_tensors: false
-}
-
-train_input_reader: {
-  tf_record_input_reader {
-    input_path: "/data/coco_train.record*"
-  }
-  label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
-}
-
-eval_config: {
-  metrics_set: "coco_detection_metrics"
-  use_moving_averages: false
-  num_examples: 8000
-}
-
-eval_input_reader: {
-  tf_record_input_reader {
-    input_path: "/data/coco_val.record*"
-  }
-  label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
-  shuffle: false
-  num_readers: 1
-}

+ 2 - 2
TensorFlow/Detection/SSD/configs/ssd320_full_1gpus.config

@@ -172,7 +172,7 @@ train_config: {
 
 train_input_reader: {
   tf_record_input_reader {
-    input_path: "/data/coco_train.record*"
+    input_path: "/data/*train*"
   }
   label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
 }
@@ -185,7 +185,7 @@ eval_config: {
 
 eval_input_reader: {
   tf_record_input_reader {
-    input_path: "/data/coco_val.record*"
+    input_path: "/data/*val*"
   }
   label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
   shuffle: false

+ 2 - 2
TensorFlow/Detection/SSD/configs/ssd320_full_4gpus.config

@@ -172,7 +172,7 @@ train_config: {
 
 train_input_reader: {
   tf_record_input_reader {
-    input_path: "/data/coco_train.record*"
+    input_path: "/data/*train*"
   }
   label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
 }
@@ -185,7 +185,7 @@ eval_config: {
 
 eval_input_reader: {
   tf_record_input_reader {
-    input_path: "/data/coco_val.record*"
+    input_path: "/data/*val*"
   }
   label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
   shuffle: false

+ 2 - 2
TensorFlow/Detection/SSD/configs/ssd320_full_8gpus.config

@@ -172,7 +172,7 @@ train_config: {
 
 train_input_reader: {
   tf_record_input_reader {
-    input_path: "/data/coco_train.record*"
+    input_path: "/data/*train*"
   }
   label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
 }
@@ -185,7 +185,7 @@ eval_config: {
 
 eval_input_reader: {
   tf_record_input_reader {
-    input_path: "/data/coco_val.record*"
+    input_path: "/data/*val*"
   }
   label_map_path: "object_detection/data/mscoco_label_map.pbtxt"
   shuffle: false

+ 4 - 0
TensorFlow/LanguageModeling/BERT/NOTICE

@@ -0,0 +1,4 @@
+BERT TensorFlow
+
+This repository includes software from https://github.com/google-research/bert
+licensed under the Apache License, Version 2.0 (the "License")

+ 38 - 36
TensorFlow/LanguageModeling/BERT/README.md

@@ -24,13 +24,13 @@ This repository provides a script and recipe to train BERT to achieve state of t
   * [Training accuracy results](#training-accuracy-results)
   * [Training stability test](#training-stability-test)
   * [Training performance results](#training-performance-results)
-      * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
-      * [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
-      * [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g)
+  * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
+  * [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
+  * [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g)
   * [Inference performance results](#inference-performance-results)
-      * [NVIDIA DGX-1 16G (1x V100 16G)](#nvidia-dgx-1-16g-1x-v100-16g)
-      * [NVIDIA DGX-1 32G (1x V100 32G)](#nvidia-dgx-1-32g-1x-v100-32g)
-      * [NVIDIA DGX-2 32G (1x V100 32G)](#nvidia-dgx-2-32g-1x-v100-32g)
+  * [NVIDIA DGX-1 16G (1x V100 16G)](#nvidia-dgx-1-16g-1x-v100-16g)
+  * [NVIDIA DGX-1 32G (1x V100 32G)](#nvidia-dgx-1-32g-1x-v100-32g)
+  * [NVIDIA DGX-2 32G (1x V100 32G)](#nvidia-dgx-2-32g-1x-v100-32g)
 * [Changelog](#changelog)
 * [Known issues](#known-issues)
 
@@ -120,7 +120,7 @@ After you build the container image and download the data, you can start an inte
 bash scripts/docker/launch.sh
 ```
 
-The `launch.sh` script assumes that the datasets are in the following locations by default after downloading data. 
+The `interactive.sh` script assumes that the datasets are in the following locations by default after downloading data. 
 - SQuaD v1.1 - `data/squad/v1.1`
 - BERT - `data/pretrained_models_google/uncased_L-24_H-1024_A-16`
 - Wikipedia - `data/wikipedia_corpus/final_tfrecords_sharded`
@@ -194,9 +194,9 @@ Aside from options to set hyperparameters, the relevant options to control the b
   --[no]amp: Whether to enable AMP ops.(default: 'false')
   --[no]amp_fastmath: Whether to enable AMP fasthmath ops.(default: 'false')
   --bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
-  --[no]do_eval: Whether to run evaluation on the dev set.(default: 'false')
+  --[no]do_eval: Whether to run eval on the dev set.(default: 'false')
   --[no]do_train: Whether to run training.(default: 'false')
-  --eval_batch_size: Total batch size for evaluation.(default: '8')(an integer)
+  --eval_batch_size: Total batch size for eval.(default: '8')(an integer)
   --[no]fastmath: Whether to enable loss scaler for fasthmath ops.(default: 'false')
   --[no]horovod: Whether to use Horovod for multi-gpu runs(default: 'false')
   --init_checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
@@ -207,7 +207,7 @@ Aside from options to set hyperparameters, the relevant options to control the b
 Aside from options to set hyperparameters, some relevant options to control the behaviour of the run_squad.py script are: 
 ```bash
   --bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
-  --[no]do_predict: Whether to run evaluation on the dev set. (default: 'false')
+  --[no]do_predict: Whether to run eval on the dev set. (default: 'false')
   --[no]do_train: Whether to run training. (default: 'false')
   --learning_rate: The initial learning rate for Adam.(default: '5e-06')(a number)
   --max_answer_length: The maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.(default: '30')(an integer)
@@ -234,13 +234,15 @@ Pre-training is performed using the `run_pretraining.py` script along with param
 
 
 The `run_pretraining.sh` script runs a job on a single node  that trains the BERT-large model from scratch using the Wikipedia and Book corpus datasets as training data. By default, the training script:
-- Runs on 8 GPUs with training batch size of 14 and evaluation batch size of 8 per GPU.
-- Has FP16 precision enabled.
-- Is XLA enabled.
-- Trains with default learning rate of 1e-4 for 1144000 steps with 10000 warm-up steps.
-- Saves a checkpoint every 5000 iterations.
-- Creates a log file containing all the output.
-- Evaluates the model at the end of training. To skip evaluation, modify `--do_eval` to `False`.
+- Assumes training batch size of 14
+- Assumes evaluation batch size of 8
+- Assumes learning rate of 1e-4
+- Assumes precision of fp16_xla (fp16 math JIT compiled with XLA)
+- Assumes you want to run on 8 GPUs
+- Assumes 10,000 warmup steps
+- Assumes 1144000 training steps
+- Assumes checkpoints should be saved every 5000 steps
+- Assumes you do want to create a log file for all the output
 
 These parameters will train Wikipedia + BooksCorpus to reasonable accuracy on a DGX1 with 32GB V100 cards. If you want to match google’s best results from the BERT paper, you should either train for twice as many steps (2,288,000 steps) on a DGX1, or train on 16 GPUs on a DGX2. The DGX2 having 16 GPUs will be able to fit a batch size twice as large as a DGX1 (224 vs 112), hence the DGX2 can finish in half as many steps. 
 
@@ -251,7 +253,7 @@ run_pretraining.sh <node_type> <training_batch_size> <eval_batch_size> <learning
 ```
 
 Where:
-- <training_batch_size> per-gpu batch size used for training. Batch size varies with <precision>, larger batch sizes run more efficiently, but require more memory.
+- <training_batch_size> Batch size varies with <precision>, larger batch sizes run more efficiently, but require more memory.
 
 - <eval_batch_size> per-gpu batch size used for evaluation after training.<learning_rate> Default rate of 1e-4 is good for global batch size 256.
 
@@ -295,16 +297,16 @@ Trains BERT-large from scratch on a single DGX-2 using FP16 arithmetic. This wil
 Fine tuning is performed using the `run_squad.py` script along with parameters defined in `scripts/run_squad.sh`.
 
 The `run_squad.sh` script trains a model and performs evaluation on the SQuaD v1.1 dataset. By default, the training script: 
-- Uses 8 GPUs and batch size of 10 on each GPU.
-- Has FP16 precision enabled.
-- Is XLA enabled.
-- Runs for 2 epochs.
+- Uses 8 GPUs and batch size of 10 on each GPU
+- Has FP16 precision enabled
+- Is XLA enabled
+- Runs for 2 epochs
 - Saves a checkpoint every 1000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
-- Evaluation is done at the end of training. To skip evaluation, modify `--do_predict` to `False`.
+- Evaluation is done at the end of training. To skip eval, modify `--do_predict` to `False`.
 
 This script outputs checkpoints to the `/results` directory, by default, inside the container. Mount point of `/results` can be changed in the `scripts/docker/launch.sh` file. The training log contains information about:
-- Loss for the final step
-- Training and evaluation performance
+- Loss for final step
+- Train and eval performance
 - F1 and exact match score on the Dev Set of SQuaD after evaluation. 
 
 The summary after training is printed in the following format:
@@ -345,12 +347,12 @@ Inference on a fine tuned Question Answering system is performed using the `run_
 The `run_squad_inference.sh` script trains a model and performs evaluation on the SQuaD v1.1 dataset. By default, the inferencing script: 
 - Has FP16 precision enabled
 - Is XLA enabled
-- Evaluates the latest checkpoint present in `/results` with a batch size of 8
+- Does eval on latest checkpoint present in `/results` with a batch size of 8
 
 This script outputs predictions file to `/results/predictions.json` and computes F1 score and exact match score using SQuaD's `evaluate-v1.1.py`. Mount point of `/results` can be changed in the `scripts/docker/launch.sh` file. 
 
 The output log contains information about:
-- Evaluation performance
+- Eval performance
 - F1 and exact match score on the Dev Set of SQuaD after evaluation. 
 
 The summary after inference is printed in the following format:
@@ -410,14 +412,14 @@ Our results were obtained by running batch sizes up to 3x GPUs on a 16GB V100 an
 Our results were obtained by running the `scripts/run_squad.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in tokens per second) were averaged over an entire training epoch.
 
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
 |:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
 | 1 | 2 | 7.41 |11.86|1.6 |1.0 |1.0 |
 | 4 | 2 |23.699|35.34|1.49|3.2 |2.98|
 | 8 | 2 |44.29 |64.96|1.47|5.98|5.48|
 
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
 |:---:|:---:|:-----:|:-----:|:---:|:---:|:----:|
 | 1 | 3 |  -  |14.86| - | - |1.0 |
 | 4 | 3 |  -  |44.17| - | - |2.97|
@@ -431,14 +433,14 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
 Our results were obtained by running the `scripts/run_squad.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epochs.
 
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
 |---|---|-----|-----|----|----|----|
 | 1 | 4 | 8.55|18.14|2.12|1.0 |1.0 |
 | 4 | 4 |32.13|52.85|1.64|3.76|2.91|
 | 8 | 4 |62.83|95.28|1.51|7.35|5.25|
 
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
 |---|---|-----|-------|---|---|----|
 | 1 | 10|  -  | 27.69 | - | - |1.0 |
 | 4 | 10|  -  | 85.193| - | - |3.07|
@@ -453,7 +455,7 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
 Our results were obtained by running the `scripts/run_squad.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
 
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
 |---|---|------|------|----|-----|----|
 |  1| 4 |  8.80| 17.43|1.98| 1.0 |1.0 |
 |  4| 4 | 33.22| 56.87|1.71| 3.78|3.26|
@@ -461,7 +463,7 @@ Our results were obtained by running the `scripts/run_squad.sh` training script
 | 16| 4 |117.83|162.29|1.38|13.39|9.31|
 
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
 |---|---|---|------|---|---|----|
 |  1| 10| - | 28.72| - | - |1.0 |
 |  4| 10| - | 92.73| - | - |3.22|
@@ -477,7 +479,7 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
 #### NVIDIA DGX-1 16G (1x V100 16G)
 Our results were obtained by running the `scripts/run_squad_inference.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
 |---|---|-----|------|----|
 | 1 | 8 |41.04|112.55|2.74|
 
@@ -487,7 +489,7 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
 #### NVIDIA DGX-1 32G (1x V100 32G)
 Our results were obtained by running the `scripts/run_squad_inference.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 1x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
 |---|---|-----|------|----|
 | 1 | 8 |36.78|118.54|3.22|
 
@@ -496,7 +498,7 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
 #### NVIDIA DGX-2 32G (1x V100 32G)
 Our results were obtained by running the `scripts/run_squad_inference.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-2 with 1x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
 
-| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
+| **Number of GPUs** | **Batch size per GPU** | **FP 32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
 |---|---|-----|------|----|
 | 1 | 8 |33.95|108.45|3.19|
 

+ 43 - 28
TensorFlow/LanguageModeling/BERT/run_squad.py

@@ -28,7 +28,7 @@ import optimization
 import tokenization
 import six
 import tensorflow as tf
-
+import horovod.tensorflow as hvd
 flags = tf.flags
 
 FLAGS = flags.FLAGS
@@ -90,6 +90,7 @@ flags.DEFINE_integer("predict_batch_size", 8,
 
 flags.DEFINE_float("learning_rate", 5e-6, "The initial learning rate for Adam.")
 
+flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
 flags.DEFINE_float("num_train_epochs", 3.0,
                    "Total number of training epochs to perform.")
 
@@ -154,7 +155,6 @@ flags.DEFINE_float(
     "If null_score - best_non_null is greater than the threshold predict null.")
 
 flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
-
 flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
 
 # report samples/sec, total loss and learning rate during training
@@ -463,7 +463,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
         start_position = 0
         end_position = 0
 
-      if example_index < 20:
+      if FLAGS.verbose_logging and example_index < 20:
         tf.logging.info("*** Example ***")
         tf.logging.info("unique_id: %s" % (unique_id))
         tf.logging.info("example_index: %s" % (example_index))
@@ -593,7 +593,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
       input_mask=input_mask,
       token_type_ids=segment_ids,
       use_one_hot_embeddings=use_one_hot_embeddings,
-      compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32)
+      compute_type=tf.float32)
 
   final_hidden = model.get_sequence_output()
 
@@ -631,10 +631,10 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
 
   def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
     """The `model_fn` for TPUEstimator."""
-
-    tf.logging.info("*** Features ***")
-    for name in sorted(features.keys()):
-      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+    if FLAGS.verbose_logging:
+        tf.logging.info("*** Features ***")
+        for name in sorted(features.keys()):
+          tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
 
     unique_ids = features["unique_ids"]
     input_ids = features["input_ids"]
@@ -655,7 +655,7 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
 
     initialized_variable_names = {}
     scaffold_fn = None
-    if init_checkpoint:
+    if init_checkpoint and (hvd is None or hvd.rank() == 0):
       (assignment_map, initialized_variable_names
       ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
       if use_tpu:
@@ -667,14 +667,16 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
         scaffold_fn = tpu_scaffold
       else:
         tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+    
+    if FLAGS.verbose_logging:
+        tf.logging.info("**** Trainable Variables ****")
+        for var in tvars:
+          init_string = ""
+          if var.name in initialized_variable_names:
+            init_string = ", *INIT_FROM_CKPT*"
+          tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape,
+                          init_string)
 
-    tf.logging.info("**** Trainable Variables ****")
-    for var in tvars:
-      init_string = ""
-      if var.name in initialized_variable_names:
-        init_string = ", *INIT_FROM_CKPT*"
-      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
-                      init_string)
 
     output_spec = None
     if mode == tf.estimator.ModeKeys.TRAIN:
@@ -721,7 +723,7 @@ def model_fn_builder(bert_config, init_checkpoint, learning_rate,
   return model_fn
 
 
-def input_fn_builder(input_file, seq_length, is_training, drop_remainder):
+def input_fn_builder(input_file, seq_length, is_training, drop_remainder, hvd=None):
   """Creates an `input_fn` closure to be passed to TPUEstimator."""
 
   name_to_features = {
@@ -751,14 +753,20 @@ def input_fn_builder(input_file, seq_length, is_training, drop_remainder):
 
   def input_fn(params):
     """The actual input function."""
+
     batch_size = params["batch_size"]
 
     # For training, we want a lot of parallel reading and shuffling.
     # For eval, we want no shuffling and parallel reading doesn't matter.
-    d = tf.data.TFRecordDataset(input_file)
     if is_training:
-      d = d.repeat()
-      d = d.shuffle(buffer_size=100)
+        d = tf.data.TFRecordDataset(input_file, num_parallel_reads=4)
+        if hvd is not None: d = d.shard(hvd.size(), hvd.rank())
+        d = d.apply(tf.data.experimental.ignore_errors())
+        d = d.shuffle(buffer_size=100)
+        d = d.repeat()
+    else:
+        d = tf.data.TFRecordDataset(input_file)
+
 
     d = d.apply(
         tf.contrib.data.map_and_batch(
@@ -771,6 +779,7 @@ def input_fn_builder(input_file, seq_length, is_training, drop_remainder):
   return input_fn
 
 
+
 RawResult = collections.namedtuple("RawResult",
                                    ["unique_id", "start_logits", "end_logits"])
 
@@ -1163,6 +1172,9 @@ def validate_flags_or_throw(bert_config):
 def main(_):
   tf.logging.set_verbosity(tf.logging.INFO)
 
+  if FLAGS.horovod:
+    hvd.init()
+
   bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
 
   validate_flags_or_throw(bert_config)
@@ -1203,7 +1215,7 @@ def main(_):
   run_config = tf.contrib.tpu.RunConfig(
       cluster=tpu_cluster_resolver,
       master=FLAGS.master,
-      model_dir=FLAGS.output_dir,
+      model_dir=FLAGS.output_dir if master_process else None,
       session_config=config,
       save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
       keep_checkpoint_max=1,
@@ -1221,7 +1233,7 @@ def main(_):
     train_examples = read_squad_examples(
         input_file=FLAGS.train_file, is_training=True)
     num_train_steps = int(
-        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+        len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
     num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
 
     # Pre-shuffle the input to avoid having to make a very large shuffle
@@ -1248,7 +1260,7 @@ def main(_):
   model_fn = model_fn_builder(
       bert_config=bert_config,
       init_checkpoint=FLAGS.init_checkpoint,
-      learning_rate=FLAGS.learning_rate,
+      learning_rate=learning_rate,
       num_train_steps=num_train_steps,
       num_warmup_steps=num_warmup_steps,
       use_tpu=FLAGS.use_tpu,
@@ -1273,7 +1285,7 @@ def main(_):
         filename=tmp_filenames[hvd_rank],
         is_training=True)
     convert_examples_to_features(
-        examples=train_examples,
+        examples=train_examples[start_index:end_index],
         tokenizer=tokenizer,
         max_seq_length=FLAGS.max_seq_length,
         doc_stride=FLAGS.doc_stride,
@@ -1287,10 +1299,15 @@ def main(_):
     tf.logging.info("  Num split examples = %d", train_writer.num_features)
     tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
     tf.logging.info("  Num steps = %d", num_train_steps)
+    tf.logging.info("  LR = %f", learning_rate)
     del train_examples
+    if FLAGS.horovod:
+        barrier = hvd.allreduce(tf.constant(0))
+        with tf.Session(config=config) as sess:
+          sess.run(barrier)
 
     train_input_fn = input_fn_builder(
-        input_file=train_writer.filename,
+        input_file=tmp_filenames,
         seq_length=FLAGS.max_seq_length,
         is_training=True,
         drop_remainder=True,
@@ -1310,7 +1327,7 @@ def main(_):
         tf.logging.info("%d Training Performance = %0.4f sentences/sec", hvd_rank, avg_sentences_per_second)
         tf.logging.info("-----------------------------")
 
-  if FLAGS.do_predict:
+  if FLAGS.do_predict and master_process:
     eval_examples = read_squad_examples(
         input_file=FLAGS.predict_file, is_training=False)
 
@@ -1338,8 +1355,6 @@ def main(_):
     tf.logging.info("  Num split examples = %d", len(eval_features))
     tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
 
-    all_results = []
-
     predict_input_fn = input_fn_builder(
         input_file=eval_writer.filename,
         seq_length=FLAGS.max_seq_length,