#! /bin/bash
# run_pretraining_adam.sh
#
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  # Report the container build id read from the NVIDIA_BUILD_ID environment
  # variable (an empty value is printed when it is not set).
  echo "Container nvidia build = " "${NVIDIA_BUILD_ID:-}"

  # Positional arguments. Each one falls back to the default shown when the
  # argument is omitted or passed as an empty string.
  train_batch_size=${1:-16}          # $1:  forwarded as --train_batch_size
  eval_batch_size=${2:-8}            # $2:  forwarded as --eval_batch_size
  learning_rate=${3:-"1e-4"}         # $3:  forwarded as --learning_rate
  precision=${4:-"fp16"}             # $4:  fp16 | fp32 | tf32 | manual_fp16
  use_xla=${5:-"true"}               # $5:  "true" enables --use_xla
  num_gpus=${6:-8}                   # $6:  process count for mpiexec -np
  warmup_steps=${7:-"10000"}         # $7:  forwarded as --num_warmup_steps
  train_steps=${8:-1144000}          # $8:  forwarded as --num_train_steps
  save_checkpoints_steps=${9:-5000}  # $9:  forwarded as --save_checkpoints_steps
  bert_model=${10:-"large"}          # $10: "large", anything else means base
  num_accumulation_steps=${11:-1}    # $11: forwarded as --num_accumulation_steps
  seq_len=${12:-512}                 # $12: forwarded as --max_seq_length
  max_pred_per_seq=${13:-80}         # $13: forwarded as --max_predictions_per_seq
  # Location of the pre-sharded TFRecord dataset. The directory name encodes
  # the sequence length and max predictions per sequence chosen by the caller.
  DATA_DIR=data/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus

  # Pick the pretrained model directory: "large" selects the large model,
  # any other value selects the base model.
  if [[ "$bert_model" == "large" ]]; then
    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
  else
    export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
  fi

  # BERT_CONFIG is passed to --bert_config_file and checked for existence
  # before training, but nothing in this script assigned it. Keep a value
  # supplied through the environment; otherwise derive it from BERT_DIR.
  # NOTE(review): assumes the pretrained directory ships a bert_config.json;
  # confirm against the download layout.
  export BERT_CONFIG="${BERT_CONFIG:-${BERT_DIR}/bert_config.json}"
  # Translate the requested precision into flags for the training command.
  # PREC is a space-separated flag string. fp32 and tf32 both map to --noamp.
  PREC=""
  case "$precision" in
    fp16)
      PREC="--amp"
      ;;
    fp32 | tf32)
      PREC="--noamp"
      ;;
    manual_fp16)
      PREC="--noamp --manual_fp16"
      ;;
    *)
      echo "Unknown <precision> argument" >&2
      # 254 is the status bash reports for the former `exit -2`.
      exit 254
      ;;
  esac

  # Append the XLA switch. Only the exact string "true" turns XLA on.
  if [[ "$use_xla" == "true" ]]; then
    PREC="$PREC --use_xla"
    echo "XLA activated"
  else
    PREC="$PREC --nouse_xla"
  fi
  # Global batch size: train batch size x number of GPUs x accumulation steps.
  # Assigned before exporting so a failing expansion is not masked by `export`.
  GBS=$(( train_batch_size * num_gpus * num_accumulation_steps ))
  export GBS

  # Tag and timestamp used to name the results directory and the log file.
  printf -v TAG "tf_bert_pretraining_adam_%s_%s_gbs%d" "$bert_model" "$precision" "$GBS"
  DATESTAMP=$(date +'%y%m%d%H%M%S')

  #Edit to save logs & checkpoints in a different directory
  # A RESULTS_DIR already present in the environment takes precedence.
  RESULTS_DIR=${RESULTS_DIR:-/results/${TAG}_${DATESTAMP}}
  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log

  # Create the results directory (and any missing parents) with mode 777.
  mkdir -m 777 -p -- "$RESULTS_DIR"
  printf "Saving checkpoints to %s\n" "$RESULTS_DIR"
  printf "Logs written to %s\n" "$LOGFILE"

  # Training and evaluation shards live in sub-directories of the dataset.
  INPUT_FILES="$DATA_DIR/training"
  EVAL_FILES="$DATA_DIR/test"
  # Assemble the training command as an array so that every argument stays a
  # single word, even when a path (for example a user-supplied RESULTS_DIR)
  # contains whitespace.
  CMD=()
  horovod_str=""
  if [[ "$num_gpus" -gt 1 ]]; then
    # More than one GPU: start num_gpus processes through mpiexec and enable
    # the --horovod flag of the training script.
    CMD+=(mpiexec --allow-run-as-root -np "$num_gpus" --bind-to socket)
    horovod_str="--horovod"
  fi

  CMD+=(python3 /workspace/bert/run_pretraining.py)
  CMD+=("--input_files_dir=$INPUT_FILES")
  CMD+=("--eval_files_dir=$EVAL_FILES")
  CMD+=("--output_dir=$RESULTS_DIR")
  CMD+=("--bert_config_file=$BERT_CONFIG")
  CMD+=("--do_train=True")
  CMD+=("--do_eval=True")
  CMD+=("--train_batch_size=$train_batch_size")
  CMD+=("--eval_batch_size=$eval_batch_size")
  CMD+=("--max_seq_length=$seq_len")
  CMD+=("--max_predictions_per_seq=$max_pred_per_seq")
  CMD+=("--num_train_steps=$train_steps")
  CMD+=("--num_warmup_steps=$warmup_steps")
  CMD+=("--num_accumulation_steps=$num_accumulation_steps")
  CMD+=("--save_checkpoints_steps=$save_checkpoints_steps")
  CMD+=("--learning_rate=$learning_rate")
  CMD+=("--optimizer_type=adam")
  if [[ -n "$horovod_str" ]]; then
    CMD+=("$horovod_str")
  fi
  # PREC is a space-separated flag string (e.g. "--noamp --manual_fp16");
  # split it into individual words before appending.
  prec_flags=()
  read -r -a prec_flags <<< "${PREC:-}"
  CMD+=("${prec_flags[@]}")
  CMD+=("--allreduce_post_accumulation=True")

  #Check if all necessary files are available before training
  # The paths are quoted so that an empty value (for example an unset
  # BERT_CONFIG) is reported here instead of being silently dropped from the
  # list.
  for required_path in "$DATA_DIR" "$BERT_CONFIG" "$RESULTS_DIR"; do
    if [[ ! -d "$required_path" && ! -f "$required_path" ]]; then
      echo "Error! $required_path directory missing. Please mount correctly" >&2
      # 255 is the status bash reports for the former `exit -1`.
      exit 255
    fi
  done

  # Run the training command with tracing enabled. When LOGFILE is set, both
  # stdout and stderr are mirrored into it through tee.
  set -x
  if [[ -z "$LOGFILE" ]]; then
    "${CMD[@]}"
    train_status=$?
  else
    "${CMD[@]}" |& tee "$LOGFILE"
    # Take the status of the training command, not of tee, so that a failed
    # run is visible to the caller.
    train_status=${PIPESTATUS[0]}
  fi
  set +x
  exit "$train_status"