# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Launches run_squad.py through paddle.distributed.launch for SQuAD v1.1
# training, evaluation and/or prediction, teeing all output to
# $OUT_DIR/logfile.txt.
#
# Requires bash (arrays, [[ ]], pipefail).
#
# Positional arguments (all optional; default shown in brackets):
#   $1   init_checkpoint         [checkpoints/squad]  -> --from-pretrained-params
#   $2   epochs                  [2]
#   $3   batch_size              [32]   used for both train and predict batch size
#   $4   learning_rate           [4.6e-5]
#   $5   warmup_proportion       [0.2]  parsed for positional compatibility only;
#                                       it is not forwarded to run_squad.py here
#   $6   precision               [amp]  "amp" enables AMP + dynamic loss scaling
#   $7   num_gpus                [8]    must be an integer from 1 to 8
#   $8   seed                    [1]
#   $9   squad_dir               [$BERT_PREP_WORKING_DIR/download/squad/v1.1]
#   $10  vocab_file              [vocab/bert-large-uncased-vocab.txt]
#   $11  OUT_DIR                 [/results]
#   $12  mode                    [train_eval]  train | eval | prediction;
#                                       any other value runs train + predict + eval
#   $13  CONFIG_FILE             [None] "None" means no --config-file is passed
#   $14  max_steps               [-1]
#   $15  enable_benchmark        [false]
#   $16  benchmark_steps         [100]
#   $17  benchmark_warmup_steps  [100]
#   $18  fuse_mha                [true] only honoured when precision is "amp"
#
# Exit status:
#   1  the output directory could not be created
#   2  num_gpus is not in 1..8
#   otherwise the status of the launched command (or of tee, if only tee fails)

echo "Container nvidia build =  ${NVIDIA_BUILD_ID:-}"

init_checkpoint=${1:-"checkpoints/squad"}
epochs=${2:-"2"}
batch_size=${3:-"32"}
learning_rate=${4:-"4.6e-5"}
# shellcheck disable=SC2034  # kept so the positional interface stays stable
warmup_proportion=${5:-"0.2"}
precision=${6:-"amp"}
num_gpus=${7:-"8"}
seed=${8:-"1"}
squad_dir=${9:-"${BERT_PREP_WORKING_DIR:-}/download/squad/v1.1"}
vocab_file=${10:-"vocab/bert-large-uncased-vocab.txt"}
OUT_DIR=${11:-"/results"}
mode=${12:-"train_eval"}
CONFIG_FILE=${13:-"None"}
max_steps=${14:-"-1"}
enable_benchmark=${15:-"false"}
benchmark_steps=${16:-"100"}
benchmark_warmup_steps=${17:-"100"}
fuse_mha=${18:-"true"}

echo "out dir is $OUT_DIR"
mkdir -p -- "$OUT_DIR"
if [[ ! -d "$OUT_DIR" ]]; then
  echo "ERROR: non existing $OUT_DIR" >&2
  exit 1
fi

# Optional flag groups are kept as arrays so that an empty group contributes
# no argument at all and values containing whitespace stay in one argument.
amp_args=()
fuse_mha_args=()
if [[ "$precision" == "amp" ]]; then
  echo "amp activated!"
  amp_args=(--amp --use-dynamic-loss-scaling --scale-loss=128.0)
  # Fused multi-head attention is only requested together with AMP.
  if [[ "$fuse_mha" == "true" ]]; then
    fuse_mha_args=(--fuse-mha)
  fi
fi

config_args=()
if [[ "$CONFIG_FILE" != "None" ]]; then
  config_args=("--config-file=${CONFIG_FILE}")
fi

bench_args=()
if [[ "$enable_benchmark" == "true" ]]; then
  bench_args=(
    --benchmark
    "--benchmark-steps=${benchmark_steps}"
    "--benchmark-warmup-steps=${benchmark_warmup_steps}"
  )
fi

unset CUDA_VISIBLE_DEVICES

# Accept exactly the single digits 1..8, then build the list "0,1,...,N-1".
if [[ ! "$num_gpus" =~ ^[1-8]$ ]]; then
  echo "Wrong number of gpus" >&2
  exit 2
fi
gpu_ids="0"
for ((gpu = 1; gpu < num_gpus; gpu++)); do
  gpu_ids+=",${gpu}"
done

cmd=(
  python -m paddle.distributed.launch "--gpus=${gpu_ids}"
  run_squad.py
  "--from-pretrained-params=${init_checkpoint}"
)

# Flag groups shared between the modes below.
train_args=(
  --do-train
  "--train-file=${squad_dir}/train-v1.1.json"
  "--train-batch-size=${batch_size}"
)
predict_args=(
  --do-predict
  "--predict-file=${squad_dir}/dev-v1.1.json"
  "--predict-batch-size=${batch_size}"
)
eval_args=(
  "--eval-script=${squad_dir}/evaluate-v1.1.py"
  --do-eval
)

case "$mode" in
  train)
    cmd+=("${train_args[@]}")
    ;;
  eval)
    cmd+=("${predict_args[@]}" "${eval_args[@]}")
    ;;
  prediction)
    cmd+=("${predict_args[@]}")
    ;;
  *)
    # "train_eval" and any unrecognised mode: train, then predict and evaluate.
    cmd+=("${train_args[@]}" "${predict_args[@]}" "${eval_args[@]}")
    ;;
esac

cmd+=(
  --do-lower-case
  --bert-model=bert-large-uncased
  "--learning-rate=${learning_rate}"
  "--seed=${seed}"
  "--epochs=${epochs}"
  --max-seq-length=384
  --doc-stride=128
  "--output-dir=${OUT_DIR}"
  "--vocab-file=${vocab_file}"
  "${config_args[@]}"
  "--max-steps=${max_steps}"
  --optimizer=AdamW
  --log-freq=100
  "${amp_args[@]}"
  "${fuse_mha_args[@]}"
  "${bench_args[@]}"
  --report-file "${OUT_DIR}/dllogger_${num_gpus}_${precision}.json"
)

LOGFILE="${OUT_DIR}/logfile.txt"
echo "${cmd[*]} |& tee $LOGFILE"

# Without pipefail the pipeline would report tee's status, so a failed run
# would make this script exit 0. With it, the launcher's failure is the
# script's exit status.
set -o pipefail
time "${cmd[@]}" 2>&1 | tee "$LOGFILE"