#!/usr/bin/env bash
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Two-phase BERT-large pretraining launcher (LAMB optimizer).
#
# Phase 1 runs run_pretraining.py with --max-seq-length=128, phase 2 with
# --max-seq-length=512, initialised from the phase 1 checkpoint directory
# ${CHECKPOINTS_DIR}/bert-large-uncased/phase1/${train_steps}.
# If a phase's dataset directory is missing or empty, it is generated from
# the Wikipedia source directory with preprocess_bert_pretrain and then
# rebalanced with balance_dask_output (both launched through mpirun).
#
# Usage: all 32 positional arguments are optional; see the defaults below.
# Environment read: NVIDIA_BUILD_ID, BERT_PREP_WORKING_DIR, LOGFILE.
# Exit status: 255 for missing data/result directories or unsupported
# phase2_bin_size/masking values, 254 for an unknown precision or GPU count,
# otherwise the status of the first failing command.

set -ex
# Without pipefail, `training |& tee logfile` reports tee's status, so a
# failed phase 1 would go unnoticed and phase 2 would start anyway.
set -o pipefail

echo "Container nvidia build = " "${NVIDIA_BUILD_ID}"

train_batch_size=${1:-256}
learning_rate=${2:-"6e-3"}
precision=${3:-"amp"}
num_gpus=${4:-8}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}
save_checkpoint_steps=${7:-200}
create_logfile=${8:-"false"}
gradient_accumulation_steps=${9:-32}
seed=${10:-12439}
job_name=${11:-"bert_lamb_pretraining"}
train_batch_size_phase2=${12:-32}
learning_rate_phase2=${13:-"4e-3"}
warmup_proportion_phase2=${14:-"0.128"}
train_steps_phase2=${15:-1563}
gradient_accumulation_steps_phase2=${16:-128}
# Change this for other datasets.
DATASET=pretrain/phase1/unbinned/parquet
DATA_DIR_PHASE1=${17:-$BERT_PREP_WORKING_DIR/${DATASET}/}
# Change this for other datasets.
DATASET2=pretrain/phase2/bin_size_64/parquet
DATA_DIR_PHASE2=${18:-$BERT_PREP_WORKING_DIR/${DATASET2}/}
CODEDIR=${19:-"/workspace/bert"}
init_checkpoint=${20:-"None"}
VOCAB_FILE=vocab/bert-large-uncased-vocab.txt
RESULTS_DIR=$CODEDIR/results
CHECKPOINTS_DIR=$RESULTS_DIR
wikipedia_source=${21:-$BERT_PREP_WORKING_DIR/wikipedia/source/}
num_dask_workers=${22:-$(nproc)}
num_shards_per_worker=${23:-128}
num_workers=${24:-4}
num_nodes=1
sample_ratio=${25:-0.9}
phase2_bin_size=${26:-64}
masking=${27:-static}
BERT_CONFIG=${28:-"None"}
enable_benchmark=${29:-"false"}
benchmark_steps=${30:-"10"}
benchmark_warmup_steps=${31:-"10"}
fuse_mha=${32:-"true"}

#######################################
# Tests whether a directory exists and holds at least one entry.
# Arguments: $1 - directory path
# Returns:   0 if $1 is a non-empty directory, non-zero otherwise
#######################################
dir_has_content() {
  [[ -d "$1" ]] && [[ -n "$(ls -A -- "$1")" ]]
}

#######################################
# Generates a pretraining dataset from the Wikipedia source and rebalances
# the resulting shards.
# Globals:   num_dask_workers, VOCAB_FILE, wikipedia_source, num_blocks,
#            sample_ratio, seed, masking_args, phase2_bin_size_args (read)
# Arguments: $1 - sink directory the dataset is written to
#            $2 - phase number; "2" adds --target-seq-length 512 and the
#                 bin-size flags
# Outputs:   echoes each command line before running it
#######################################
build_dataset() {
  local -r sink_dir=$1
  local -r phase=$2

  local -a preprocess_cmd=(
    mpirun
    --oversubscribe
    --allow-run-as-root
    -np "${num_dask_workers}"
    -x LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
    preprocess_bert_pretrain
    --schedule mpi
    --vocab-file "${VOCAB_FILE}"
    --wikipedia "${wikipedia_source}"
    --sink "${sink_dir}"
  )
  if [[ "${phase}" == "2" ]]; then
    preprocess_cmd+=(--target-seq-length 512)
  fi
  preprocess_cmd+=(
    --num-blocks "${num_blocks}"
    --sample-ratio "${sample_ratio}"
  )
  if [[ "${phase}" == "2" ]]; then
    preprocess_cmd+=("${phase2_bin_size_args[@]}")
  fi
  preprocess_cmd+=("${masking_args[@]}" --seed "${seed}")

  echo "Running ${preprocess_cmd[*]} ..."
  "${preprocess_cmd[@]}"

  local -a balance_load_cmd=(
    mpirun
    --oversubscribe
    --allow-run-as-root
    -np "${num_dask_workers}"
    balance_dask_output
    --indir "${sink_dir}"
    --num-shards "${num_blocks}"
  )
  echo "Running ${balance_load_cmd[*]} ..."
  "${balance_load_cmd[@]}"
}

#######################################
# Chooses the log file for one training phase.
# Globals:   num_gpus, precision, RESULTS_DIR, job_name (read);
#            GBS (exported), TAG, DATESTAMP, LOGFILE (written)
# Arguments: $1 - phase name used in the tag ("phase1" or "phase2")
#            $2 - per-GPU batch size
#            $3 - gradient accumulation steps
# Outputs:   prints the chosen log file path
#######################################
start_logfile() {
  local -r phase_name=$1
  local -r batch_size=$2
  local -r accumulation_steps=$3

  # Global batch size = per-GPU batch * GPUs * accumulation steps.
  GBS=$((batch_size * num_gpus * accumulation_steps))
  export GBS
  printf -v TAG 'paddle_bert_pretraining_%s_%s_gbs%d' \
    "${phase_name}" "${precision}" "${GBS}"
  DATESTAMP=$(date +'%y%m%d%H%M%S')
  LOGFILE="${RESULTS_DIR}/${job_name}.${TAG}.${DATESTAMP}.log"
  printf "Logs written to %s\n" "${LOGFILE}"
}

#######################################
# Runs a command, mirroring stdout and stderr into LOGFILE when it is set.
# Globals:   LOGFILE (read)
# Arguments: the command and its arguments
# Returns:   the command's status (pipefail keeps tee from masking it)
#######################################
run_training() {
  if [[ -z "${LOGFILE}" ]]; then
    "$@"
  else
    ("$@") |& tee "${LOGFILE}"
  fi
}

# Calculate the total number of shards.
readonly num_blocks=$((num_shards_per_worker * (num_workers > 0 ? num_workers : 1) * num_nodes * num_gpus))

# `return` is only valid when this file is sourced; when it is executed the
# `return` fails and `exit` takes over. 255 is the status `-1` maps to.
case "${phase2_bin_size}" in
  none)
    readonly -a phase2_bin_size_args=()
    ;;
  32 | 64 | 128 | 256 | 512)
    readonly -a phase2_bin_size_args=(--bin-size "${phase2_bin_size}")
    ;;
  *)
    echo "Error! phase2_bin_size=${phase2_bin_size} not supported!"
    return 255 2> /dev/null || exit 255
    ;;
esac

case "${masking}" in
  static)
    readonly -a masking_args=(--masking)
    ;;
  dynamic)
    readonly -a masking_args=()
    ;;
  *)
    echo "Error! masking=${masking} not supported!"
    return 255 2> /dev/null || exit 255
    ;;
esac

mkdir -p -- "${CHECKPOINTS_DIR}"

if ! dir_has_content "${DATA_DIR_PHASE1}"; then
  echo "Warning! ${DATA_DIR_PHASE1} directory missing."
  if ! dir_has_content "${wikipedia_source}"; then
    echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
    return 255 2> /dev/null || exit 255
  fi
  build_dataset "${DATA_DIR_PHASE1}" 1
fi

if [[ ! -d "${RESULTS_DIR}" ]]; then
  echo "Error! $RESULTS_DIR directory missing."
  exit 255
fi
if [[ ! -d "${CHECKPOINTS_DIR}" ]]; then
  echo "Warning! $CHECKPOINTS_DIR directory missing."
  echo "Checkpoints will be written to $RESULTS_DIR instead."
  CHECKPOINTS_DIR=$RESULTS_DIR
fi

CONFIG=()
if [[ "${BERT_CONFIG}" != "None" ]]; then
  CONFIG=("--config-file=${BERT_CONFIG}")
fi

# Precision flags are shared by both phases; --fuse-mha is only used with amp.
PREC=()
FUSE_MHA=()
case "${precision}" in
  amp)
    PREC=(--amp --use-dynamic-loss-scaling --scale-loss=1048576)
    if [[ "${fuse_mha}" == "true" ]]; then
      FUSE_MHA=(--fuse-mha)
    fi
    ;;
  fp32 | tf32) ;;
  *)
    echo "Unknown argument"
    exit 254
    ;;
esac

INIT_CHECKPOINT=()
if [[ "${init_checkpoint}" != "None" ]]; then
  INIT_CHECKPOINT=(
    "--from-checkpoint=${init_checkpoint}"
    --last-step-of-checkpoint=auto
  )
fi

BENCH=()
if [[ "${enable_benchmark}" == "true" ]]; then
  BENCH=(
    --benchmark
    "--benchmark-steps=${benchmark_steps}"
    "--benchmark-warmup-steps=${benchmark_warmup_steps}"
  )
fi

unset CUDA_VISIBLE_DEVICES

# Launch on GPUs 0..num_gpus-1; only 1 to 8 GPUs are accepted.
case "${num_gpus}" in
  [1-8])
    gpu_ids="0"
    for ((gpu = 1; gpu < num_gpus; gpu++)); do
      gpu_ids+=",${gpu}"
    done
    DIST_CMD=(python -m paddle.distributed.launch "--gpus=${gpu_ids}")
    ;;
  *)
    echo "Wrong number of gpus"
    exit 254
    ;;
esac

# Start Phase1
echo "${DATA_DIR_PHASE1}"
INPUT_DIR=$DATA_DIR_PHASE1
CMD=(
  "${DIST_CMD[@]}"
  "${CODEDIR}/run_pretraining.py"
  "--input-dir=${DATA_DIR_PHASE1}"
  "--vocab-file=${VOCAB_FILE}"
  "--output-dir=${CHECKPOINTS_DIR}"
  "${CONFIG[@]}"
  --bert-model=bert-large-uncased
  "--batch-size=${train_batch_size}"
  --max-seq-length=128
  --max-predictions-per-seq=20
  "--max-steps=${train_steps}"
  "--warmup-proportion=${warmup_proportion}"
  "--num-steps-per-checkpoint=${save_checkpoint_steps}"
  "--learning-rate=${learning_rate}"
  "--seed=${seed}"
  --log-freq=1
  --optimizer=Lamb
  --phase1
  "${PREC[@]}"
  "${FUSE_MHA[@]}"
  "--gradient-merge-steps=${gradient_accumulation_steps}"
  "${INIT_CHECKPOINT[@]}"
  "${BENCH[@]}"
  --report-file "${RESULTS_DIR}/dllogger_p1.json"
)

if [[ "${create_logfile}" == "true" ]]; then
  start_logfile "phase1" "${train_batch_size}" "${gradient_accumulation_steps}"
fi

set -x
run_training "${CMD[@]}"
set +x

echo "finished pretraining"

# Start Phase2
if ! dir_has_content "${DATA_DIR_PHASE2}"; then
  echo "Warning! ${DATA_DIR_PHASE2} directory missing."
  if ! dir_has_content "${wikipedia_source}"; then
    echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
    return 255 2> /dev/null || exit 255
  fi
  build_dataset "${DATA_DIR_PHASE2}" 2
fi

echo "${DATA_DIR_PHASE2}"
INPUT_DIR=$DATA_DIR_PHASE2
PHASE1_END_CKPT_DIR="${CHECKPOINTS_DIR}/bert-large-uncased/phase1/${train_steps}"
# NOTE(review): phase 2 never passed --fuse-mha even when fuse_mha=true;
# kept as-is - confirm whether that omission is intentional.
CMD=(
  "${DIST_CMD[@]}"
  "${CODEDIR}/run_pretraining.py"
  "--input-dir=${DATA_DIR_PHASE2}"
  "--vocab-file=${VOCAB_FILE}"
  "--output-dir=${CHECKPOINTS_DIR}"
  "${CONFIG[@]}"
  --bert-model=bert-large-uncased
  "--batch-size=${train_batch_size_phase2}"
  --max-seq-length=512
  --max-predictions-per-seq=80
  "--max-steps=${train_steps_phase2}"
  "--warmup-proportion=${warmup_proportion_phase2}"
  "--num-steps-per-checkpoint=${save_checkpoint_steps}"
  "--learning-rate=${learning_rate_phase2}"
  "--seed=${seed}"
  --log-freq=1
  --optimizer=Lamb
  "${PREC[@]}"
  "--gradient-merge-steps=${gradient_accumulation_steps_phase2}"
  "${BENCH[@]}"
  "--from-pretrained-params=${PHASE1_END_CKPT_DIR}"
  --phase2
  --report-file "${RESULTS_DIR}/dllogger_p2.json"
)

if [[ "${create_logfile}" == "true" ]]; then
  start_logfile "phase2" "${train_batch_size_phase2}" \
    "${gradient_accumulation_steps_phase2}"
fi

set -x
run_training "${CMD[@]}"
set +x

echo "finished phase2"