| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- #!/bin/bash
- # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- bert_model=${1:-"large"}
- use_xla=${2:-"true"}
- num_gpu=${3:-"8"}
- task=${4:-"squad"}
- if [ "$bert_model" = "large" ] ; then
- export BERT_DIR=data/download/nvidia_pretrained/bert_tf_pretraining_large_lamb
- else
- export BERT_DIR=data/download/nvidia_pretrained/bert_tf_squad11_base_128
- fi
- echo "BERT directory set as " $BERT_DIR
- init_checkpoint="$BERT_DIR/bert_model.ckpt"
- learning_rate=5e-6
- #Edit to save logs & checkpoints in a different directory
- RESULTS_DIR=/results
- if [ ! -d "$RESULTS_DIR" ] ; then
- echo "Error! $RESULTS_DIR directory missing."
- exit -1
- fi
- echo "Results directory set as " $RESULTS_DIR
- if [ "$use_xla" = "true" ] ; then
- use_xla_tag="--use_xla"
- else
- use_xla_tag="--nouse_xla"
- fi
- if [ $num_gpu -gt 1 ] ; then
- mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
- --allow-run-as-root -bind-to none -map-by slot \
- -x NCCL_DEBUG=INFO \
- -x LD_LIBRARY_PATH \
- -x PATH -mca pml ob1 -mca btl ^openib"
- use_hvd="--horovod"
- else
- mpi_command=""
- use_hvd=""
- fi
- LOGFILE="${RESULTS_DIR}/${task}_training_benchmark_bert_${bert_model}_gpu_${num_gpu}.log"
- if [ "$task" = "squad" ] ; then
- export SQUAD_DIR=data/download/squad/v1.1
- epochs="2.0"
- echo "Squad directory set as " $SQUAD_DIR
- echo "Training performance benchmarking for BERT $bert_model from $BERT_DIR" >> $LOGFILE
- echo "Precision Sequence Length Batch size Performance(sent/sec)" >> $LOGFILE
- for seq_len in 128 384; do
- if [ "$seq_len" = "128" ] ; then
- doc_stride=64
- else
- doc_stride=128
- fi
- for batch_size in 1 2 4; do
- for use_fp16 in "--amp" "--noamp"; do
- res_dir=${RESULTS_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_len}_prec_${use_fp16}_bs_${batch_size}
- mkdir -p $res_dir
- tmp_file="${res_dir}/${task}_training_benchmark.log"
- $mpi_command python run_squad.py \
- --vocab_file=$BERT_DIR/vocab.txt \
- --bert_config_file=$BERT_DIR/bert_config.json \
- --init_checkpoint=$init_checkpoint \
- --do_train=True \
- --train_file=$SQUAD_DIR/train-v1.1.json \
- --train_batch_size=$batch_size \
- --learning_rate=$learning_rate \
- --num_train_epochs=$epochs \
- --max_seq_length=$seq_len \
- --doc_stride=$doc_stride \
- --output_dir=$res_dir \
- "$use_hvd" \
- "$use_fp16" \
- $use_xla_tag |& tee $tmp_file
- perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | head -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
- echo "$use_fp16 $seq_len $batch_size $perf" >> $LOGFILE
- done
- done
- done
- else
- echo "Benchmarking for " $task "currently not supported. Sorry!"
- fi
|