| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- #!/bin/bash
- # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
# Print the container build id (empty outside NVIDIA containers); quoted to
# avoid word-splitting the expansion.
echo "Container nvidia build = $NVIDIA_BUILD_ID"

# ---- Positional arguments (all optional; defaults below) ----
# NOTE(review): the phase2 values reuse the SAME positions ($1, $2, $5, $6,
# $11) as phase1, so a caller-supplied value overrides BOTH phases; only the
# defaults differ between phases. Confirm this is intended before changing.
train_batch_size=${1:-8192}              # phase1 per-step batch size
learning_rate=${2:-"6e-3"}               # phase1 peak learning rate
precision=${3:-"fp16"}                   # "fp16" or "fp32"
num_gpus=${4:-8}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}                   # phase1 max optimizer steps
save_checkpoint_steps=${7:-200}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-128}
seed=${12:-$RANDOM}                      # random seed unless pinned by caller
job_name=${13:-"bert_lamb_pretraining"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
accumulate_into_fp16=${16:-"false"}

# Phase2 (seq-len 512) settings — see NOTE above about shared positions.
train_batch_size_phase2=${1:-4096}
learning_rate_phase2=${2:-"4e-3"}
warmup_proportion_phase2=${5:-"0.128"}
train_steps_phase2=${6:-1563}
gradient_accumulation_steps_phase2=${11:-512}
# ---- Data / output locations ----
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus # change this for other datasets
DATA_DIR=$BERT_PREP_WORKING_DIR/${DATASET}/
BERT_CONFIG=bert_config.json
RESULTS_DIR=/workspace/bert/results
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints

mkdir -p "$CHECKPOINTS_DIR"

# Missing training data is only a warning here; the failure will surface when
# run_pretraining.py actually starts reading shards.
if [ ! -d "$DATA_DIR" ] ; then
   echo "Warning! $DATA_DIR directory missing. Training cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
   echo "Error! $RESULTS_DIR directory missing."
   exit 1   # was 'exit -1'; negative exit codes are not portable (status is 0-255)
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
   echo "Warning! $CHECKPOINTS_DIR directory missing."
   echo "Checkpoints will be written to $RESULTS_DIR instead."
   CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
   echo "Error! BERT large configuration file not found at $BERT_CONFIG"
   exit 1   # was 'exit -1'
fi
# ---- Translate the boolean/choice options into run_pretraining.py flags ----
# Each *_FLAG variable is either empty or a single "--flag[=value]" token that
# is later spliced (unquoted, intentionally) into the command string.
PREC=""
if [ "$precision" = "fp16" ] ; then
   PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
   PREC=""
else
   echo "Unknown <precision> argument"
   exit 2   # was 'exit -2'; negative exit codes are not portable
fi

ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" = "true" ] ; then
   ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps"
fi

CHECKPOINT=""
if [ "$resume_training" = "true" ] ; then
   CHECKPOINT="--resume_from_checkpoint"
fi

ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" = "true" ] ; then
   ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi

ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" = "true" ] ; then
   ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi

ACCUMULATE_INTO_FP16=""
if [ "$accumulate_into_fp16" = "true" ] ; then
   ACCUMULATE_INTO_FP16="--accumulate_into_fp16"
fi
# ---- Assemble the phase1 (seq-len 128) training command ----
# CMD is deliberately a flat string: it is executed later via unquoted
# expansion, relying on word-splitting (original script convention).
echo "$DATA_DIR"
CMD=" /workspace/bert/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train"

# Multi-GPU runs go through torch.distributed.launch; single GPU runs plain python3.
if [ "$num_gpus" -gt 1 ] ; then
   CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
   CMD="python3 $CMD"
fi
# ---- Optional per-run logfile, then launch phase1 ----
if [ "$create_logfile" = "true" ] ; then
   export GBS=$(( train_batch_size * num_gpus ))   # global batch size (was 'expr')
   printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" "$GBS"
   DATESTAMP=$(date +'%y%m%d%H%M%S')               # $() instead of backticks
   LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
   printf "Logs written to %s\n" "$LOGFILE"
fi

set -x   # trace the training command as it runs
if [ -z "$LOGFILE" ] ; then
   $CMD   # intentional word-splitting of the assembled command string
else
   (
     $CMD
   ) |& tee "$LOGFILE"   # quoted: path contains user-supplied job_name
fi
set +x
echo "finished pretraining, starting benchmarking"

# Benchmark thresholds — NOTE(review): nothing below compares against these;
# looks like leftover harness scaffolding. Kept for compatibility.
target_loss=15
THROUGHPUT=10
THRESHOLD=0.9

# Last reported it/s figure from a training log.
# $1 - log file path. Prints the number (empty if absent). Reads the file
# directly instead of 'cat |' so an empty path fails fast rather than
# blocking on stdin.
get_last_throughput() {
   grep Iteration "$1" | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | grep -Eo '[0-9.]+'
}

# Numeric value following "<label> =" on the last line matching a pattern.
# $1 - log file, $2 - grep pattern, $3 - awk field separator (label + " =").
get_last_metric() {
   grep "$2" "$1" | tail -1 | awk -F"$3" '{print $2}' | awk '{print $1}' | grep -Eo '[0-9.]+'
}

throughput=$(get_last_throughput "$LOGFILE")
loss=$(get_last_metric "$LOGFILE" 'Average Loss' 'Average Loss =')
final_loss=$(get_last_metric "$LOGFILE" 'Total Steps' 'Final Loss =')

# Sequences/second across all GPUs. awk -v passes shell values safely instead
# of splicing them into the program text (which syntax-errored when empty).
train_perf=$(awk -v t="$throughput" -v g="$num_gpus" -v b="$train_batch_size" 'BEGIN {print t * g * b}')
echo " training throughput phase1: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"
# ---- Start Phase2 (seq-len 512, max 80 masked predictions) ----
DATASET=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus # change this for other datasets
DATA_DIR=$BERT_PREP_WORKING_DIR/${DATASET}/

# Re-derive the option flags for phase2; only the gradient-accumulation step
# count actually differs from the phase1 values.
PREC=""
if [ "$precision" = "fp16" ] ; then
   PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
   PREC=""
else
   echo "Unknown <precision> argument"
   exit 2   # was 'exit -2'; negative exit codes are not portable
fi

ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" = "true" ] ; then
   ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
fi

ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" = "true" ] ; then
   ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi

ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" = "true" ] ; then
   ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi

ACCUMULATE_INTO_FP16=""
if [ "$accumulate_into_fp16" = "true" ] ; then
   ACCUMULATE_INTO_FP16="--accumulate_into_fp16"
fi
# Build the phase2 training command line as a flat string (later executed via
# unquoted expansion). The arguments are collected in an array and joined with
# single spaces, which yields the exact same string as repeated appends.
echo $DATA_DIR
INPUT_DIR=$DATA_DIR

phase2_args=(
  "--input_dir=$DATA_DIR"
  "--output_dir=$CHECKPOINTS_DIR"
  "--config_file=$BERT_CONFIG"
  "--bert_model=bert-large-uncased"
  "--train_batch_size=$train_batch_size_phase2"
  "--max_seq_length=512"
  "--max_predictions_per_seq=80"
  "--max_steps=$train_steps_phase2"
  "--warmup_proportion=$warmup_proportion_phase2"
  "--num_steps_per_checkpoint=$save_checkpoint_steps"
  "--learning_rate=$learning_rate_phase2"
  "--seed=$seed"
  "$PREC"
  "$ACCUMULATE_GRADIENTS"
  "$CHECKPOINT"
  "$ALL_REDUCE_POST_ACCUMULATION"
  "$ALL_REDUCE_POST_ACCUMULATION_FP16"
  "$ACCUMULATE_INTO_FP16"
  "--do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
)
CMD=" /workspace/bert/run_pretraining.py ${phase2_args[*]}"

# Prefix the launcher: torch.distributed for multi-GPU, plain python3 otherwise.
if [ "$num_gpus" -gt 1 ] ; then
   CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
   CMD="python3 $CMD"
fi
# ---- Optional phase2 logfile, then launch ----
if [ "$create_logfile" = "true" ] ; then
   export GBS=$(( train_batch_size_phase2 * num_gpus ))   # global batch size (was 'expr')
   printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" "$GBS"
   DATESTAMP=$(date +'%y%m%d%H%M%S')                      # $() instead of backticks
   LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log     # overwrites the phase1 path
   printf "Logs written to %s\n" "$LOGFILE"
fi

set -x   # trace the training command as it runs
if [ -z "$LOGFILE" ] ; then
   $CMD   # intentional word-splitting of the assembled command string
else
   (
     $CMD
   ) |& tee "$LOGFILE"   # quoted: path contains user-supplied job_name
fi
set +x
echo "finished phase2"

# Pull the last-reported metrics out of the phase2 log. grep reads the file
# directly (no 'cat |', which blocked on stdin when $LOGFILE was empty) and
# the numeric pattern is quoted so the shell cannot glob-expand it.
throughput=$(grep Iteration "$LOGFILE" | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | grep -Eo '[0-9.]+')
loss=$(grep 'Average Loss' "$LOGFILE" | tail -1 | awk -F'Average Loss =' '{print $2}' | awk '{print $1}' | grep -Eo '[0-9.]+')
final_loss=$(grep 'Total Steps' "$LOGFILE" | tail -1 | awk -F'Final Loss =' '{print $2}' | awk '{print $1}' | grep -Eo '[0-9.]+')

# Sequences/second across all GPUs. awk -v passes shell values safely instead
# of splicing them into the program text (which syntax-errored when empty).
train_perf=$(awk -v t="$throughput" -v g="$num_gpus" -v b="$train_batch_size_phase2" 'BEGIN {print t * g * b}')
echo " training throughput phase2: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"
|