#!/bin/bash
# finetune_train_benchmark.sh
#
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. bert_model=${1:-"large"}
  15. use_xla=${2:-"true"}
  16. num_gpu=${3:-"8"}
  17. task=${4:-"squad"}
  18. if [ "$bert_model" = "large" ] ; then
  19. export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
  20. else
  21. export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
  22. fi
  23. echo "BERT directory set as " $BERT_DIR
  24. init_checkpoint="$BERT_DIR/bert_model.ckpt"
  25. learning_rate=5e-6
  26. #Edit to save logs & checkpoints in a different directory
  27. RESULTS_DIR=/results
  28. if [ ! -d "$RESULTS_DIR" ] ; then
  29. echo "Error! $RESULTS_DIR directory missing."
  30. exit -1
  31. fi
  32. echo "Results directory set as " $RESULTS_DIR
  33. if [ "$use_xla" = "true" ] ; then
  34. use_xla_tag="--use_xla"
  35. else
  36. use_xla_tag="--nouse_xla"
  37. fi
  38. if [ $num_gpu -gt 1 ] ; then
  39. mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
  40. --allow-run-as-root -bind-to none -map-by slot \
  41. -x NCCL_DEBUG=INFO \
  42. -x LD_LIBRARY_PATH \
  43. -x PATH -mca pml ob1 -mca btl ^openib"
  44. use_hvd="--horovod"
  45. else
  46. mpi_command=""
  47. use_hvd=""
  48. fi
  49. LOGFILE="${RESULTS_DIR}/${task}_training_benchmark_bert_${bert_model}_gpu_${num_gpu}.log"
  50. if [ "$task" = "squad" ] ; then
  51. export SQUAD_DIR=data/download/squad/v1.1
  52. epochs="2.0"
  53. echo "Squad directory set as " $SQUAD_DIR
  54. echo "Training performance benchmarking for BERT $bert_model from $BERT_DIR" >> $LOGFILE
  55. echo "Precision Sequence Length Batch size Performance(sent/sec)" >> $LOGFILE
  56. for seq_len in 128 384; do
  57. if [ "$seq_len" = "128" ] ; then
  58. doc_stride=64
  59. else
  60. doc_stride=128
  61. fi
  62. for batch_size in 1 2 4; do
  63. for use_fp16 in "--amp" "--noamp"; do
  64. res_dir=${RESULTS_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_len}_prec_${use_fp16}_bs_${batch_size}
  65. mkdir -p $res_dir
  66. tmp_file="${res_dir}/${task}_training_benchmark.log"
  67. $mpi_command python run_squad.py \
  68. --vocab_file=$BERT_DIR/vocab.txt \
  69. --bert_config_file=$BERT_DIR/bert_config.json \
  70. --init_checkpoint=$init_checkpoint \
  71. --do_train=True \
  72. --train_file=$SQUAD_DIR/train-v1.1.json \
  73. --train_batch_size=$batch_size \
  74. --learning_rate=$learning_rate \
  75. --num_train_epochs=$epochs \
  76. --max_seq_length=$seq_len \
  77. --doc_stride=$doc_stride \
  78. --output_dir=$res_dir \
  79. "$use_hvd" \
  80. "$use_fp16" \
  81. $use_xla_tag |& tee $tmp_file
  82. perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | head -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
  83. echo "$use_fp16 $seq_len $batch_size $perf" >> $LOGFILE
  84. done
  85. done
  86. done
  87. else
  88. echo "Benchmarking for " $task "currently not supported. Sorry!"
  89. fi