#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Builds and runs the `python run_squad.py` command line for SQuAD v1.1
# training and/or prediction, teeing all output to $OUT_DIR/logfile.txt.
#
# Usage:
#   run_squad.sh [init_checkpoint] [epochs] [batch_size] [learning_rate]
#                [precision] [num_gpu] [seed] [squad_dir] [vocab_file]
#                [out_dir] [mode] [config_file] [max_steps]
#
# Positional arguments (all optional; defaults are listed below in the code):
#   $1  init_checkpoint  checkpoint passed as --init_checkpoint
#   $2  epochs           passed as --num_train_epochs
#   $3  batch_size       used for both --train_batch_size and
#                        --predict_batch_size
#   $4  learning_rate    passed as --learning_rate
#   $5  precision        "fp16" adds --fp16; any other value adds nothing
#   $6  num_gpu          "1" runs plain python on device 0; any other value
#                        goes through torch.distributed.launch
#   $7  seed             passed as --seed
#   $8  squad_dir        directory holding train-v1.1.json, dev-v1.1.json and
#                        evaluate-v1.1.py
#   $9  vocab_file       passed as --vocab_file
#   $10 out_dir          output directory (created if missing)
#   $11 mode             "train", "eval", "prediction"; any other value
#                        (including the default "train eval") trains, predicts
#                        and evaluates
#   $12 config_file      passed as --config_file
#   $13 max_steps        passed as --max_steps
#
# Environment:
#   NVIDIA_BUILD_ID         only echoed for the log
#   BERT_PREP_WORKING_DIR   root of the default squad_dir / vocab_file paths
#
# Exit status: 1 if the output directory cannot be created, otherwise the
# status of the python | tee pipeline.

# Without pipefail the pipeline's status would be tee's, so a failed python
# run would be reported as success to the caller.
set -o pipefail

echo "Container nvidia build = " "${NVIDIA_BUILD_ID:-}"

init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"}
epochs=${2:-"2.0"}
batch_size=${3:-"4"}
learning_rate=${4:-"3e-5"}
precision=${5:-"fp16"}
num_gpu=${6:-"8"}
seed=${7:-"1"}
squad_dir=${8:-"$BERT_PREP_WORKING_DIR/download/squad/v1.1"}
vocab_file=${9:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"}
OUT_DIR=${10:-"/workspace/bert/results/SQuAD"}
mode=${11:-"train eval"}
CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}
max_steps=${13:-"-1"}

echo "out dir is $OUT_DIR"
mkdir -p -- "$OUT_DIR"
if [[ ! -d "$OUT_DIR" ]]; then
  echo "ERROR: non existing $OUT_DIR" >&2
  exit 1
fi

fp16_args=()
if [[ "$precision" == "fp16" ]]; then
  echo "fp16 activated!"
  fp16_args=(--fp16)
fi

# The command is assembled as an array so that every option stays a single
# word even when a path contains whitespace or glob characters.
cmd=(python)
if [[ "$num_gpu" == "1" ]]; then
  export CUDA_VISIBLE_DEVICES=0
else
  unset CUDA_VISIBLE_DEVICES
  cmd+=(-m torch.distributed.launch "--nproc_per_node=$num_gpu")
fi
cmd+=(run_squad.py "--init_checkpoint=$init_checkpoint")

# Flag groups shared by the different modes.
train_args=(
  --do_train
  "--train_file=$squad_dir/train-v1.1.json"
  "--train_batch_size=$batch_size"
)
predict_args=(
  --do_predict
  "--predict_file=$squad_dir/dev-v1.1.json"
  "--predict_batch_size=$batch_size"
)
eval_args=(
  "--eval_script=$squad_dir/evaluate-v1.1.py"
  --do_eval
)

case "$mode" in
  train)
    cmd+=("${train_args[@]}")
    ;;
  eval)
    cmd+=("${predict_args[@]}" "${eval_args[@]}")
    ;;
  prediction)
    cmd+=("${predict_args[@]}")
    ;;
  *)
    # Any other value, including the default "train eval", runs everything.
    cmd+=("${train_args[@]}" "${predict_args[@]}" "${eval_args[@]}")
    ;;
esac

cmd+=(
  --do_lower_case
  --bert_model=bert-large-uncased
  "--learning_rate=$learning_rate"
  "--seed=$seed"
  "--num_train_epochs=$epochs"
  --max_seq_length=384
  --doc_stride=128
  "--output_dir=$OUT_DIR"
  "--vocab_file=$vocab_file"
  "--config_file=$CONFIG_FILE"
  "--max_steps=$max_steps"
  "${fp16_args[@]}"
)

LOGFILE=$OUT_DIR/logfile.txt

# Print the exact command in shell-quoted form so it can be copy-pasted.
printf '%q ' "${cmd[@]}"
printf '|& tee %q\n' "$LOGFILE"

time "${cmd[@]}" |& tee "$LOGFILE"