  1. #! /bin/bash
  2. # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. echo "Container nvidia build = " $NVIDIA_BUILD_ID
  15. train_batch_size_phase1=${1:-64}
  16. train_batch_size_phase2=${2:-8}
  17. eval_batch_size=${3:-8}
  18. learning_rate_phase1=${4:-"7.5e-4"}
  19. learning_rate_phase2=${5:-"5e-4"}
  20. precision=${6:-"fp16"}
  21. use_xla=${7:-"true"}
  22. num_gpus=${8:-8}
  23. warmup_steps_phase1=${9:-"2000"}
  24. warmup_steps_phase2=${10:-"200"}
  25. train_steps=${11:-7820}
  26. save_checkpoints_steps=${12:-100}
  27. num_accumulation_steps_phase1=${13:-128}
  28. num_accumulation_steps_phase2=${14:-512}
  29. bert_model=${15:-"large"}
  30. DATA_DIR=${DATA_DIR:-data}
  31. #Edit to save logs & checkpoints in a different directory
  32. RESULTS_DIR=${RESULTS_DIR:-/results}
  33. if [ "$bert_model" = "large" ] ; then
  34. export BERT_CONFIG=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb/bert_config.json
  35. else
  36. export BERT_CONFIG=data/download/nvidia_pretrained/bert_tf_squad11_base_128/bert_config.json
  37. fi
  38. echo "Container nvidia build = " $NVIDIA_BUILD_ID
  39. PREC=""
  40. if [ "$precision" = "fp16" ] ; then
  41. PREC="--amp"
  42. elif [ "$precision" = "fp32" ] ; then
  43. PREC="--noamp"
  44. elif [ "$precision" = "tf32" ] ; then
  45. PREC="--noamp"
  46. elif [ "$precision" = "manual_fp16" ] ; then
  47. PREC="--noamp --manual_fp16"
  48. else
  49. echo "Unknown <precision> argument"
  50. exit -2
  51. fi
  52. if [ "$use_xla" = "true" ] ; then
  53. PREC="$PREC --use_xla"
  54. echo "XLA activated"
  55. else
  56. PREC="$PREC --nouse_xla"
  57. fi
  58. mpi=""
  59. horovod_str=""
  60. if [ $num_gpus -gt 1 ] ; then
  61. mpi="mpiexec --allow-run-as-root -np $num_gpus --bind-to socket"
  62. horovod_str="--horovod"
  63. fi
  64. #PHASE 1 Config
  65. train_steps_phase1=$(expr $train_steps \* 9 \/ 10) #Phase 1 is 10% of training
  66. gbs_phase1=$(expr $train_batch_size_phase1 \* $num_accumulation_steps_phase1)
  67. PHASE1_CKPT=${RESULTS_DIR}/phase_1/model.ckpt-${train_steps_phase1}
  68. #PHASE 2
  69. seq_len=512
  70. max_pred_per_seq=80
  71. train_steps_phase2=$(expr $train_steps \* 1 \/ 10) #Phase 2 is 10% of training
  72. gbs_phase2=$(expr $train_batch_size_phase2 \* $num_accumulation_steps_phase2)
  73. train_steps_phase2=$(expr $train_steps_phase2 \* $gbs_phase1 \/ $gbs_phase2) # Adjust for batch size
  74. RESULTS_DIR_PHASE2=${RESULTS_DIR}/phase_2
  75. mkdir -m 777 -p $RESULTS_DIR_PHASE2
  76. INPUT_FILES="$DATA_DIR/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training"
  77. EVAL_FILES="$DATA_DIR/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test"
  78. #Check if all necessary files are available before training
  79. for DIR_or_file in $DATA_DIR $RESULTS_DIR $BERT_CONFIG ${PHASE1_CKPT}.meta; do
  80. if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then
  81. echo "Error! $DIR_or_file directory missing. Please mount correctly"
  82. exit -1
  83. fi
  84. done
  85. $mpi python /workspace/bert/run_pretraining.py \
  86. --input_files_dir=$INPUT_FILES \
  87. --init_checkpoint=$PHASE1_CKPT \
  88. --eval_files_dir=$EVAL_FILES \
  89. --output_dir=$RESULTS_DIR_PHASE2 \
  90. --bert_config_file=$BERT_CONFIG \
  91. --do_train=True \
  92. --do_eval=True \
  93. --train_batch_size=$train_batch_size_phase2 \
  94. --eval_batch_size=$eval_batch_size \
  95. --max_seq_length=$seq_len \
  96. --max_predictions_per_seq=$max_pred_per_seq \
  97. --num_train_steps=$train_steps_phase2 \
  98. --num_accumulation_steps=$num_accumulation_steps_phase2 \
  99. --num_warmup_steps=$warmup_steps_phase2 \
  100. --save_checkpoints_steps=$save_checkpoints_steps \
  101. --learning_rate=$learning_rate_phase2 \
  102. $horovod_str $PREC \
  103. --allreduce_post_accumulation=True