#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. echo "Container nvidia build = " $NVIDIA_BUILD_ID
  15. train_batch_size=${1:-8192}
  16. learning_rate=${2:-"6e-3"}
  17. precision=${3:-"fp16"}
  18. num_gpus=${4:-8}
  19. warmup_proportion=${5:-"0.2843"}
  20. train_steps=${6:-7038}
  21. save_checkpoint_steps=${7:-200}
  22. resume_training=${8:-"false"}
  23. create_logfile=${9:-"true"}
  24. accumulate_gradients=${10:-"true"}
  25. gradient_accumulation_steps=${11:-128}
  26. seed=${12:-$RANDOM}
  27. job_name=${13:-"bert_lamb_pretraining"}
  28. allreduce_post_accumulation=${14:-"true"}
  29. allreduce_post_accumulation_fp16=${15:-"true"}
  30. accumulate_into_fp16=${16:-"false"}
  31. train_batch_size_phase2=${1:-4096}
  32. learning_rate_phase2=${2:-"4e-3"}
  33. warmup_proportion_phase2=${5:-"0.128"}
  34. train_steps_phase2=${6:-1563}
  35. gradient_accumulation_steps_phase2=${11:-512}
  36. DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus # change this for other datasets
  37. DATA_DIR=$BERT_PREP_WORKING_DIR/${DATASET}/
  38. BERT_CONFIG=bert_config.json
  39. RESULTS_DIR=/workspace/bert/results
  40. CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
  41. mkdir -p $CHECKPOINTS_DIR
  42. if [ ! -d "$DATA_DIR" ] ; then
  43. echo "Warning! $DATA_DIR directory missing. Training cannot start"
  44. fi
  45. if [ ! -d "$RESULTS_DIR" ] ; then
  46. echo "Error! $RESULTS_DIR directory missing."
  47. exit -1
  48. fi
  49. if [ ! -d "$CHECKPOINTS_DIR" ] ; then
  50. echo "Warning! $CHECKPOINTS_DIR directory missing."
  51. echo "Checkpoints will be written to $RESULTS_DIR instead."
  52. CHECKPOINTS_DIR=$RESULTS_DIR
  53. fi
  54. if [ ! -f "$BERT_CONFIG" ] ; then
  55. echo "Error! BERT large configuration file not found at $BERT_CONFIG"
  56. exit -1
  57. fi
  58. PREC=""
  59. if [ "$precision" = "fp16" ] ; then
  60. PREC="--fp16"
  61. elif [ "$precision" = "fp32" ] ; then
  62. PREC=""
  63. else
  64. echo "Unknown <precision> argument"
  65. exit -2
  66. fi
  67. ACCUMULATE_GRADIENTS=""
  68. if [ "$accumulate_gradients" == "true" ] ; then
  69. ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps"
  70. fi
  71. CHECKPOINT=""
  72. if [ "$resume_training" == "true" ] ; then
  73. CHECKPOINT="--resume_from_checkpoint"
  74. fi
  75. ALL_REDUCE_POST_ACCUMULATION=""
  76. if [ "$allreduce_post_accumulation" == "true" ] ; then
  77. ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
  78. fi
  79. ALL_REDUCE_POST_ACCUMULATION_FP16=""
  80. if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
  81. ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
  82. fi
  83. ACCUMULATE_INTO_FP16=""
  84. if [ "$accumulate_into_fp16" == "true" ] ; then
  85. ACCUMULATE_INTO_FP16="--accumulate_into_fp16"
  86. fi
  87. echo $DATA_DIR
  88. INPUT_DIR=$DATA_DIR
  89. CMD=" /workspace/bert/run_pretraining.py"
  90. CMD+=" --input_dir=$DATA_DIR"
  91. CMD+=" --output_dir=$CHECKPOINTS_DIR"
  92. CMD+=" --config_file=$BERT_CONFIG"
  93. CMD+=" --bert_model=bert-large-uncased"
  94. CMD+=" --train_batch_size=$train_batch_size"
  95. CMD+=" --max_seq_length=128"
  96. CMD+=" --max_predictions_per_seq=20"
  97. CMD+=" --max_steps=$train_steps"
  98. CMD+=" --warmup_proportion=$warmup_proportion"
  99. CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
  100. CMD+=" --learning_rate=$learning_rate"
  101. CMD+=" --seed=$seed"
  102. CMD+=" $PREC"
  103. CMD+=" $ACCUMULATE_GRADIENTS"
  104. CMD+=" $CHECKPOINT"
  105. CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
  106. CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
  107. CMD+=" $ACCUMULATE_INTO_FP16"
  108. CMD+=" --do_train"
  109. if [ "$num_gpus" -gt 1 ] ; then
  110. CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
  111. else
  112. CMD="python3 $CMD"
  113. fi
  114. if [ "$create_logfile" = "true" ] ; then
  115. export GBS=$(expr $train_batch_size \* $num_gpus)
  116. printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS
  117. DATESTAMP=`date +'%y%m%d%H%M%S'`
  118. LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
  119. printf "Logs written to %s\n" "$LOGFILE"
  120. fi
  121. set -x
  122. if [ -z "$LOGFILE" ] ; then
  123. $CMD
  124. else
  125. (
  126. $CMD
  127. ) |& tee $LOGFILE
  128. fi
  129. set +x
  130. echo "finished pretraining, starting benchmarking"
  131. target_loss=15
  132. THROUGHPUT=10
  133. THRESHOLD=0.9
  134. throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
  135. loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
  136. final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
  137. train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size')}')
  138. echo " training throughput phase1: $train_perf sequences/second"
  139. echo "average loss: $loss"
  140. echo "final loss: $final_loss"
  141. #Start Phase2
  142. DATASET=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus # change this for other datasets
  143. DATA_DIR=$BERT_PREP_WORKING_DIR/${DATASET}/
  144. PREC=""
  145. if [ "$precision" = "fp16" ] ; then
  146. PREC="--fp16"
  147. elif [ "$precision" = "fp32" ] ; then
  148. PREC=""
  149. else
  150. echo "Unknown <precision> argument"
  151. exit -2
  152. fi
  153. ACCUMULATE_GRADIENTS=""
  154. if [ "$accumulate_gradients" == "true" ] ; then
  155. ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
  156. fi
  157. ALL_REDUCE_POST_ACCUMULATION=""
  158. if [ "$allreduce_post_accumulation" == "true" ] ; then
  159. ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
  160. fi
  161. ALL_REDUCE_POST_ACCUMULATION_FP16=""
  162. if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
  163. ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
  164. fi
  165. ACCUMULATE_INTO_FP16=""
  166. if [ "$accumulate_into_fp16" == "true" ] ; then
  167. ACCUMULATE_INTO_FP16="--accumulate_into_fp16"
  168. fi
  169. echo $DATA_DIR
  170. INPUT_DIR=$DATA_DIR
  171. CMD=" /workspace/bert/run_pretraining.py"
  172. CMD+=" --input_dir=$DATA_DIR"
  173. CMD+=" --output_dir=$CHECKPOINTS_DIR"
  174. CMD+=" --config_file=$BERT_CONFIG"
  175. CMD+=" --bert_model=bert-large-uncased"
  176. CMD+=" --train_batch_size=$train_batch_size_phase2"
  177. CMD+=" --max_seq_length=512"
  178. CMD+=" --max_predictions_per_seq=80"
  179. CMD+=" --max_steps=$train_steps_phase2"
  180. CMD+=" --warmup_proportion=$warmup_proportion_phase2"
  181. CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
  182. CMD+=" --learning_rate=$learning_rate_phase2"
  183. CMD+=" --seed=$seed"
  184. CMD+=" $PREC"
  185. CMD+=" $ACCUMULATE_GRADIENTS"
  186. CMD+=" $CHECKPOINT"
  187. CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
  188. CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
  189. CMD+=" $ACCUMULATE_INTO_FP16"
  190. CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
  191. if [ "$num_gpus" -gt 1 ] ; then
  192. CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
  193. else
  194. CMD="python3 $CMD"
  195. fi
  196. if [ "$create_logfile" = "true" ] ; then
  197. export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
  198. printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
  199. DATESTAMP=`date +'%y%m%d%H%M%S'`
  200. LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
  201. printf "Logs written to %s\n" "$LOGFILE"
  202. fi
  203. set -x
  204. if [ -z "$LOGFILE" ] ; then
  205. $CMD
  206. else
  207. (
  208. $CMD
  209. ) |& tee $LOGFILE
  210. fi
  211. set +x
  212. echo "finished phase2"
  213. throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
  214. loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
  215. final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
  216. train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size_phase2')}')
  217. echo " training throughput phase2: $train_perf sequences/second"
  218. echo "average loss: $loss"
  219. echo "final loss: $final_loss"