# run_squad.sh
#
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. echo "Container nvidia build = " $NVIDIA_BUILD_ID
  15. init_checkpoint=${1:-"checkpoints/squad"}
  16. epochs=${2:-"2"}
  17. batch_size=${3:-"32"}
  18. learning_rate=${4:-"4.6e-5"}
  19. warmup_proportion=${5:-"0.2"}
  20. precision=${6:-"amp"}
  21. num_gpus=${7:-"8"}
  22. seed=${8:-"1"}
  23. squad_dir=${9:-"$BERT_PREP_WORKING_DIR/download/squad/v1.1"}
  24. vocab_file=${10:-"vocab/bert-large-uncased-vocab.txt"}
  25. OUT_DIR=${11:-"/results"}
  26. mode=${12:-"train_eval"}
  27. CONFIG_FILE=${13:-"None"}
  28. max_steps=${14:-"-1"}
  29. enable_benchmark=${15:-"false"}
  30. benchmark_steps=${16:-"100"}
  31. benchmark_warmup_steps=${17:-"100"}
  32. fuse_mha=${18:-"true"}
  33. echo "out dir is $OUT_DIR"
  34. mkdir -p $OUT_DIR
  35. if [ ! -d "$OUT_DIR" ]; then
  36. echo "ERROR: non existing $OUT_DIR"
  37. exit 1
  38. fi
  39. amp=""
  40. FUSE_MHA=""
  41. if [ "$precision" = "amp" ] ; then
  42. echo "amp activated!"
  43. amp=" --amp --use-dynamic-loss-scaling --scale-loss=128.0"
  44. if [ "$fuse_mha" = "true" ] ; then
  45. FUSE_MHA="--fuse-mha"
  46. fi
  47. fi
  48. CONFIG=""
  49. if [ "$CONFIG_FILE" != "None" ] ; then
  50. CONFIG="--config-file=$CONFIG_FILE"
  51. fi
  52. BENCH=""
  53. if [ "$enable_benchmark" = "true" ] ; then
  54. BENCH="--benchmark --benchmark-steps=$benchmark_steps --benchmark-warmup-steps=$benchmark_warmup_steps"
  55. fi
  56. unset CUDA_VISIBLE_DEVICES
  57. if [ "$num_gpus" = "1" ] ; then
  58. CMD="python -m paddle.distributed.launch --gpus=0"
  59. elif [ "$num_gpus" = "2" ] ; then
  60. CMD="python -m paddle.distributed.launch --gpus=0,1"
  61. elif [ "$num_gpus" = "3" ] ; then
  62. CMD="python -m paddle.distributed.launch --gpus=0,1,2"
  63. elif [ "$num_gpus" = "4" ] ; then
  64. CMD="python -m paddle.distributed.launch --gpus=0,1,2,3"
  65. elif [ "$num_gpus" = "5" ] ; then
  66. CMD="python -m paddle.distributed.launch --gpus=0,1,2,3,4"
  67. elif [ "$num_gpus" = "6" ] ; then
  68. CMD="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5"
  69. elif [ "$num_gpus" = "7" ] ; then
  70. CMD="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6"
  71. elif [ "$num_gpus" = "8" ] ; then
  72. CMD="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7"
  73. else
  74. echo "Wrong number of gpus"
  75. exit 2
  76. fi
  77. CMD+=" run_squad.py "
  78. CMD+="--from-pretrained-params=$init_checkpoint "
  79. if [ "$mode" = "train" ] ; then
  80. CMD+="--do-train "
  81. CMD+="--train-file=$squad_dir/train-v1.1.json "
  82. CMD+="--train-batch-size=$batch_size "
  83. elif [ "$mode" = "eval" ] ; then
  84. CMD+="--do-predict "
  85. CMD+="--predict-file=$squad_dir/dev-v1.1.json "
  86. CMD+="--predict-batch-size=$batch_size "
  87. CMD+="--eval-script=$squad_dir/evaluate-v1.1.py "
  88. CMD+="--do-eval "
  89. elif [ "$mode" = "prediction" ] ; then
  90. CMD+="--do-predict "
  91. CMD+="--predict-file=$squad_dir/dev-v1.1.json "
  92. CMD+="--predict-batch-size=$batch_size "
  93. else
  94. CMD+=" --do-train "
  95. CMD+=" --train-file=$squad_dir/train-v1.1.json "
  96. CMD+=" --train-batch-size=$batch_size "
  97. CMD+="--do-predict "
  98. CMD+="--predict-file=$squad_dir/dev-v1.1.json "
  99. CMD+="--predict-batch-size=$batch_size "
  100. CMD+="--eval-script=$squad_dir/evaluate-v1.1.py "
  101. CMD+="--do-eval "
  102. fi
  103. CMD+=" --do-lower-case "
  104. CMD+=" --bert-model=bert-large-uncased "
  105. CMD+=" --learning-rate=$learning_rate "
  106. CMD+=" --seed=$seed "
  107. CMD+=" --epochs=$epochs "
  108. CMD+=" --max-seq-length=384 "
  109. CMD+=" --doc-stride=128 "
  110. CMD+=" --output-dir=$OUT_DIR "
  111. CMD+=" --vocab-file=$vocab_file "
  112. CMD+=" $CONFIG "
  113. CMD+=" --max-steps=$max_steps "
  114. CMD+=" --optimizer=AdamW "
  115. CMD+=" --log-freq=100 "
  116. CMD+=" $amp "
  117. CMD+=" $FUSE_MHA "
  118. CMD+=" $BENCH "
  119. CMD+=" --report-file $OUT_DIR/dllogger_${num_gpus}_${precision}.json "
  120. LOGFILE=$OUT_DIR/logfile.txt
  121. echo "$CMD |& tee $LOGFILE"
  122. time $CMD |& tee $LOGFILE