#!/bin/bash
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -ex

# Quote the expansion so an empty/odd build id cannot glob or word-split.
echo "Container nvidia build = " "$NVIDIA_BUILD_ID"

# All arguments are optional positionals. Phase-1 training hyperparameters:
train_batch_size=${1:-256}
learning_rate=${2:-"6e-3"}
precision=${3:-"amp"}              # one of: amp | fp32 | tf32
num_gpus=${4:-8}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}
save_checkpoint_steps=${7:-200}
create_logfile=${8:-"false"}
gradient_accumulation_steps=${9:-32}
seed=${10:-12439}
job_name=${11:-"bert_lamb_pretraining"}

# Phase-2 training hyperparameters:
train_batch_size_phase2=${12:-32}
learning_rate_phase2=${13:-"4e-3"}
warmup_proportion_phase2=${14:-"0.128"}
train_steps_phase2=${15:-1563}
gradient_accumulation_steps_phase2=${16:-128}

# Change DATASET/DATASET2 for other datasets.
DATASET=pretrain/phase1/unbinned/parquet
DATA_DIR_PHASE1=${17:-$BERT_PREP_WORKING_DIR/${DATASET}/}
DATASET2=pretrain/phase2/bin_size_64/parquet
DATA_DIR_PHASE2=${18:-$BERT_PREP_WORKING_DIR/${DATASET2}/}

CODEDIR=${19:-"/workspace/bert"}
init_checkpoint=${20:-"None"}      # "None" = train from scratch
VOCAB_FILE=vocab/bert-large-uncased-vocab.txt
RESULTS_DIR=$CODEDIR/results
CHECKPOINTS_DIR=$RESULTS_DIR

# Dask/MPI preprocessing options (used only if the data dirs are empty):
wikipedia_source=${21:-$BERT_PREP_WORKING_DIR/wikipedia/source/}
num_dask_workers=${22:-$(nproc)}
num_shards_per_worker=${23:-128}
num_workers=${24:-4}
num_nodes=1
sample_ratio=${25:-0.9}
phase2_bin_size=${26:-64}          # "none" or one of 32/64/128/256/512
masking=${27:-static}              # static | dynamic

# Misc / benchmarking options:
BERT_CONFIG=${28:-"None"}          # "None" = use the built-in model config
enable_benchmark=${29:-"false"}
benchmark_steps=${30:-"10"}
benchmark_warmup_steps=${31:-"10"}
fuse_mha=${32:-"true"}             # fused multi-head attention (amp only)
# Total number of output data shards produced by preprocessing:
# shards-per-worker x workers (at least 1) x nodes x gpus.
readonly num_blocks=$((num_shards_per_worker * (num_workers > 0 ? num_workers : 1) * num_nodes * num_gpus))

# Validate the phase-2 sequence-length binning: "none" disables binning,
# otherwise only bin sizes 32/64/128/256/512 are supported downstream.
if [ "${phase2_bin_size}" == "none" ]; then
  readonly phase2_bin_size_flag=""
elif [[ "${phase2_bin_size}" =~ ^(32|64|128|256|512)$ ]]; then
  readonly phase2_bin_size_flag="--bin-size ${phase2_bin_size}"
else
  echo "Error! phase2_bin_size=${phase2_bin_size} not supported!" >&2
  # NOTE: was 'return -1', which is invalid at top level when the script is
  # executed (not sourced); 'exit 1' aborts correctly in both cases.
  exit 1
fi

# static masking bakes the masks into the data at preprocessing time;
# dynamic masking leaves it to the training loop.
if [ "${masking}" == "static" ]; then
  readonly masking_flag="--masking"
elif [ "${masking}" == "dynamic" ]; then
  readonly masking_flag=""
else
  echo "Error! masking=${masking} not supported!" >&2
  exit 1
fi
mkdir -p "$CHECKPOINTS_DIR"

# Build the phase-1 (max-seq-len 128) training shards from the raw wikipedia
# dump when the phase-1 data directory is missing or empty.
if [ ! -d "${DATA_DIR_PHASE1}" ] || [ -z "$(ls -A "${DATA_DIR_PHASE1}")" ]; then
  echo "Warning! ${DATA_DIR_PHASE1} directory missing."
  if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A "${wikipedia_source}")" ]; then
    echo "Error! ${wikipedia_source} directory missing. Training cannot start!" >&2
    # was 'return -1': invalid outside a sourced script; exit aborts reliably
    exit 1
  fi

  # One dask worker per MPI rank; jemalloc (via LD_PRELOAD) reduces allocator
  # contention inside the workers.
  preprocess_cmd=" \
    mpirun \
      --oversubscribe \
      --allow-run-as-root \
      -np ${num_dask_workers} \
      -x LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so \
        preprocess_bert_pretrain \
          --schedule mpi \
          --vocab-file ${VOCAB_FILE} \
          --wikipedia ${wikipedia_source} \
          --sink ${DATA_DIR_PHASE1} \
          --num-blocks ${num_blocks} \
          --sample-ratio ${sample_ratio} \
          ${masking_flag} \
          --seed ${seed}"
  echo "Running ${preprocess_cmd} ..."
  ${preprocess_cmd}

  # Rebalance shard sizes so every training rank reads a similar data volume.
  balance_load_cmd=" \
    mpirun \
      --oversubscribe \
      --allow-run-as-root \
      -np ${num_dask_workers} \
        balance_dask_output \
          --indir ${DATA_DIR_PHASE1} \
          --num-shards ${num_blocks}"
  echo "Running ${balance_load_cmd} ..."
  ${balance_load_cmd}
fi
# Sanity-check the output locations (RESULTS_DIR was created via mkdir -p).
if [ ! -d "$RESULTS_DIR" ]; then
  # exit codes must be 0-255; the old 'exit -1' is non-portable
  echo "Error! $RESULTS_DIR directory missing." >&2
  exit 1
fi
if [ ! -d "$CHECKPOINTS_DIR" ]; then
  echo "Warning! $CHECKPOINTS_DIR directory missing."
  echo "Checkpoints will be written to $RESULTS_DIR instead."
  CHECKPOINTS_DIR=$RESULTS_DIR
fi

# Optional explicit model config file; "None" keeps the built-in default.
CONFIG=""
if [ "$BERT_CONFIG" != "None" ]; then
  CONFIG="--config-file=$BERT_CONFIG"
fi

# Precision flags: amp enables dynamic loss scaling; fp32/tf32 need no flags.
PREC=""
FUSE_MHA=""
if [ "$precision" = "amp" ]; then
  PREC="--amp --use-dynamic-loss-scaling --scale-loss=1048576"
  # Fused multi-head attention is only enabled together with amp.
  if [ "$fuse_mha" = "true" ]; then
    FUSE_MHA="--fuse-mha"
  fi
elif [ "$precision" = "fp32" ]; then
  PREC=""
elif [ "$precision" = "tf32" ]; then
  PREC=""
else
  echo "Unknown <precision> argument" >&2
  exit 2
fi

ACCUMULATE_GRADIENTS="--gradient-merge-steps=$gradient_accumulation_steps"

# Optionally resume phase 1 from an existing checkpoint directory.
INIT_CHECKPOINT=""
if [ "$init_checkpoint" != "None" ]; then
  INIT_CHECKPOINT="--from-checkpoint=$init_checkpoint --last-step-of-checkpoint=auto"
fi

# Optional benchmark mode: limited steps with a measured warmup.
BENCH=""
if [ "$enable_benchmark" = "true" ]; then
  BENCH="--benchmark --benchmark-steps=$benchmark_steps --benchmark-warmup-steps=$benchmark_warmup_steps"
fi
# Let paddle.distributed.launch select devices explicitly; drop any GPU
# restriction inherited from the caller's environment.
unset CUDA_VISIBLE_DEVICES

# Build "--gpus=0,1,...,n-1" for 1..8 GPUs. Replaces eight duplicated
# if/elif branches with one validated loop; behavior for 1-8 is identical.
if [[ "$num_gpus" =~ ^[1-8]$ ]]; then
  gpu_ids="0"
  for ((i = 1; i < num_gpus; i++)); do
    gpu_ids+=",${i}"
  done
  DIST_CMD="python -m paddle.distributed.launch --gpus=${gpu_ids}"
else
  echo "Wrong number of gpus" >&2
  exit 2
fi
echo "$DATA_DIR_PHASE1"
INPUT_DIR=$DATA_DIR_PHASE1

# Assemble the phase-1 (seq-len 128) pretraining command line.
CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input-dir=$DATA_DIR_PHASE1"
CMD+=" --vocab-file=$VOCAB_FILE"
CMD+=" --output-dir=$CHECKPOINTS_DIR"
CMD+=" $CONFIG "
CMD+=" --bert-model=bert-large-uncased"
CMD+=" --batch-size=$train_batch_size"
CMD+=" --max-seq-length=128"
CMD+=" --max-predictions-per-seq=20"
CMD+=" --max-steps=$train_steps"
CMD+=" --warmup-proportion=$warmup_proportion"
CMD+=" --num-steps-per-checkpoint=$save_checkpoint_steps"
CMD+=" --learning-rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" --log-freq=1"
CMD+=" --optimizer=Lamb"
CMD+=" --phase1"
CMD+=" $PREC"
CMD+=" $FUSE_MHA"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $INIT_CHECKPOINT"
CMD+=" $BENCH"
CMD+=" --report-file ${RESULTS_DIR}/dllogger_p1.json "
CMD="$DIST_CMD $CMD"

if [ "$create_logfile" = "true" ]; then
  # Global batch size = per-GPU batch x gpus x gradient-accumulation steps.
  # $(( )) replaces the legacy 'expr'; $( ) replaces backticks.
  export GBS=$((train_batch_size * num_gpus * gradient_accumulation_steps))
  printf -v TAG "paddle_bert_pretraining_phase1_%s_gbs%d" "$precision" "$GBS"
  DATESTAMP=$(date +'%y%m%d%H%M%S')
  LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
  printf "Logs written to %s\n" "$LOGFILE"
fi

# $CMD is expanded unquoted on purpose: it is a flat flag string that must
# word-split into individual arguments.
set -x
if [ -z "$LOGFILE" ]; then
  $CMD
else
  (
    $CMD
  ) |& tee "$LOGFILE"
fi
set +x

echo "finished pretraining"
# Start Phase 2: re-derive the precision flags (same policy as phase 1,
# but fused MHA is not re-enabled here).
PREC=""
case "$precision" in
  amp)
    PREC="--amp --use-dynamic-loss-scaling --scale-loss=1048576"
    ;;
  fp32 | tf32)
    PREC=""
    ;;
  *)
    # exit codes must be 0-255; the old 'exit -2' is non-portable
    echo "Unknown <precision> argument" >&2
    exit 2
    ;;
esac

# Phase 2 uses its own gradient-accumulation setting.
ACCUMULATE_GRADIENTS="--gradient-merge-steps=$gradient_accumulation_steps_phase2"
# Build the phase-2 (max-seq-len 512) training shards when the phase-2 data
# directory is missing or empty.
if [ ! -d "${DATA_DIR_PHASE2}" ] || [ -z "$(ls -A "${DATA_DIR_PHASE2}")" ]; then
  echo "Warning! ${DATA_DIR_PHASE2} directory missing."
  if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A "${wikipedia_source}")" ]; then
    echo "Error! ${wikipedia_source} directory missing. Training cannot start!" >&2
    # was 'return -1': invalid outside a sourced script; exit aborts reliably
    exit 1
  fi

  # Same preprocessing pipeline as phase 1, plus the 512-token target length
  # and optional sequence-length binning.
  preprocess_cmd=" \
    mpirun \
      --oversubscribe \
      --allow-run-as-root \
      -np ${num_dask_workers} \
      -x LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so \
        preprocess_bert_pretrain \
          --schedule mpi \
          --vocab-file ${VOCAB_FILE} \
          --wikipedia ${wikipedia_source} \
          --sink ${DATA_DIR_PHASE2} \
          --target-seq-length 512 \
          --num-blocks ${num_blocks} \
          --sample-ratio ${sample_ratio} \
          ${phase2_bin_size_flag} \
          ${masking_flag} \
          --seed ${seed}"
  echo "Running ${preprocess_cmd} ..."
  ${preprocess_cmd}

  # Rebalance shard sizes so every training rank reads a similar data volume.
  balance_load_cmd=" \
    mpirun \
      --oversubscribe \
      --allow-run-as-root \
      -np ${num_dask_workers} \
        balance_dask_output \
          --indir ${DATA_DIR_PHASE2} \
          --num-shards ${num_blocks}"
  echo "Running ${balance_load_cmd} ..."
  ${balance_load_cmd}
fi
echo "$DATA_DIR_PHASE2"
INPUT_DIR=$DATA_DIR_PHASE2

# Phase 2 warm-starts from the final phase-1 checkpoint, which
# run_pretraining.py writes under <output>/<model>/phase1/<last step>.
PHASE1_END_CKPT_DIR="${CHECKPOINTS_DIR}/bert-large-uncased/phase1/${train_steps}"

# Assemble the phase-2 (seq-len 512) pretraining command line.
CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input-dir=$DATA_DIR_PHASE2"
CMD+=" --vocab-file=$VOCAB_FILE"
CMD+=" --output-dir=$CHECKPOINTS_DIR"
CMD+=" $CONFIG "
CMD+=" --bert-model=bert-large-uncased"
CMD+=" --batch-size=$train_batch_size_phase2"
CMD+=" --max-seq-length=512"
CMD+=" --max-predictions-per-seq=80"
CMD+=" --max-steps=$train_steps_phase2"
CMD+=" --warmup-proportion=$warmup_proportion_phase2"
CMD+=" --num-steps-per-checkpoint=$save_checkpoint_steps"
CMD+=" --learning-rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" --log-freq=1"
CMD+=" --optimizer=Lamb"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $BENCH"
CMD+=" --from-pretrained-params=${PHASE1_END_CKPT_DIR} "
CMD+=" --phase2 "
CMD+=" --report-file ${RESULTS_DIR}/dllogger_p2.json "
CMD="$DIST_CMD $CMD"

if [ "$create_logfile" = "true" ]; then
  # Global batch size for the phase-2 configuration.
  # $(( )) replaces the legacy 'expr'; $( ) replaces backticks.
  export GBS=$((train_batch_size_phase2 * num_gpus * gradient_accumulation_steps_phase2))
  printf -v TAG "paddle_bert_pretraining_phase2_%s_gbs%d" "$precision" "$GBS"
  DATESTAMP=$(date +'%y%m%d%H%M%S')
  LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
  printf "Logs written to %s\n" "$LOGFILE"
fi

# $CMD is expanded unquoted on purpose: it is a flat flag string that must
# word-split into individual arguments.
set -x
if [ -z "$LOGFILE" ]; then
  $CMD
else
  (
    $CMD
  ) |& tee "$LOGFILE"
fi
set +x

echo "finished phase2"
|