#!/bin/bash
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
#SBATCH --parsable
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -eux
#
# Job Configurations
#
# Tag of the built image.
IMAGE_VERSION=${IMAGE_VERSION:-"21.11-py3"}
# Number of processes per node used for the LDDL preprocessor.
DASK_TASKS_PER_NODE=${DASK_TASKS_PER_NODE:-128}
# Pretraining phase: 1 or 2.
PHASE=${PHASE:-1}
# An integer that specifies the pretraining seed.
SEED=${SEED:-42}
# The fraction of articles from the Wikipedia dataset to sample and use for
# pretraining. 0 < ${SAMPLE_RATIO} < 1.0
SAMPLE_RATIO=${SAMPLE_RATIO:-0.9}
# How many global steps to run before ending the pretraining job. This argument
# does not affect the learning rate schedule; it only determines whether the
# pretraining job exits early. 'none' or an integer.
STEPS_THIS_RUN=${STEPS_THIS_RUN:-"none"}
# Number of GPUs per node. 0 < ${GPUS} <= 8.
GPUS=${GPUS:-"8"}
# The bin size for binned LDDL data loading. 'none' or an integer that divides
# 128 (for Phase1) or 512 (for Phase2).
BIN_SIZE=${BIN_SIZE:-"64"}
# Number of parquet shards per LDDL data loader worker process. 'none' or an
# integer.
NUM_SHARDS_PER_WORKER=${NUM_SHARDS_PER_WORKER:-"none"}
# Number of LDDL data loader worker processes per rank.
NUM_WORKERS=${NUM_WORKERS:-4}
# Should we rerun the LDDL preprocessor every time? 'true' or 'false'.
RERUN_DASK=${RERUN_DASK:-"true"}
# 'static' or 'dynamic'.
MASKING=${MASKING:-"static"}
# Should we use jemalloc for the LDDL preprocessor? 'true' or 'false'.
USE_JEMALLOC=${USE_JEMALLOC:-"true"}
# 'fp16' or 'tf32'.
PRECISION=${PRECISION:-"fp16"}
# 'base' or 'large'.
CONFIG=${CONFIG:-"large"}
# The path to the initial checkpoint (from Phase1) used to start Phase2. 'none'
# or an absolute path.
INIT_CHECKPOINT=${INIT_CHECKPOINT:-"none"}
# The per-rank batch size before being divided by the gradient accumulation
# steps.
TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-"8192"}
# The gradient accumulation steps.
GRADIENT_ACCUMULATION_STEPS=${GRADIENT_ACCUMULATION_STEPS:-"32"}
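# For reference, a hedged usage sketch (the script file name and node count are
# placeholders, not part of this script): all of the knobs above are plain
# environment variables with defaults, so they can be overridden at submission
# time, e.g.
#   PHASE=1 GPUS=8 PRECISION=fp16 sbatch -N 4 run_pretraining.sub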
#
# Static Configurations
#
# Container URL.
# Replace this with the URL of the docker image that you built
# with scripts/docker/build.sh .
readonly docker_image="bert:${IMAGE_VERSION}"
# Where the datasets are stored on the system.
readonly host_datadir="/home/${USER}/datasets"
readonly container_datadir="/datasets"
# Replace these with the path to the 'source' subdirectory of the LDDL Wikipedia
# dataset.
readonly host_wikipedia_source="${host_datadir}/wikipedia/source"
readonly container_wikipedia_source="${container_datadir}/wikipedia/source"
readonly wikipedia_mount="${host_wikipedia_source}:${container_wikipedia_source}"
# Replace these with where you want to store the Parquet shards in case
# ${RERUN_DASK} is 'false'.
readonly host_pretrain="${host_datadir}/pretrain"
readonly container_pretrain="${container_datadir}/pretrain"
readonly pretrain_mount="${host_pretrain}:${container_pretrain}"
# Replace these with where you want to store the pretrained checkpoints on
# the system.
readonly host_output="$PWD/results/${SLURM_JOB_ID}"
mkdir -p "${host_output}"
readonly container_output="/results"
readonly output_mount="${host_output}:${container_output}"
# If INIT_CHECKPOINT is 'none', infer INIT_CHECKPOINT based on the job
# dependency. This assumes that ${SLURM_JOB_DEPENDENCY} resolves to the Phase1
# job ID, i.e. that it names the Phase1 results directory under $PWD/results .
if [ "${INIT_CHECKPOINT}" == "none" ] && [ "${PHASE}" == "2" ]; then
  INIT_CHECKPOINT="$PWD/results/${SLURM_JOB_DEPENDENCY}/ckpt_7038.pt"
fi
# Define mounts.
mounts="${wikipedia_mount},${pretrain_mount},${output_mount}"
# Add the mount path of the initial checkpoint for Phase2.
if [ "${PHASE}" == "1" ]; then
  echo "No initial checkpoint mounted for Phase1."
  readonly container_init_checkpoint=""
elif [ "${PHASE}" == "2" ]; then
  if [ ! -f "${INIT_CHECKPOINT}" ]; then
    echo "No initial checkpoint found for Phase2!"
    exit 1
  else
    mounts="${mounts},$(dirname "${INIT_CHECKPOINT}"):/checkpoints"
    readonly container_init_checkpoint="/checkpoints/$(basename "${INIT_CHECKPOINT}")"
  fi
else
  echo "\${PHASE} = ${PHASE} unknown!"
  exit 1
fi
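# For illustration (hypothetical user name and job ID, only to show the mount
# format consumed by --container-mounts): with the defaults above, ${mounts}
# expands to something like
#   /home/jane/datasets/wikipedia/source:/datasets/wikipedia/source,/home/jane/datasets/pretrain:/datasets/pretrain,$PWD/results/123456:/results
# plus a trailing ",<Phase1 checkpoint dir>:/checkpoints" entry for Phase2.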
# Determine where the parquet shards should be stored.
if [ "${RERUN_DASK}" == "true" ]; then
  # Always rerun the dask pipeline. Therefore, use the output directory to
  # store the parquets.
  readonly host_pretrain_parquet="${host_output}/parquet"
  readonly container_pretrain_parquet="${container_output}/parquet"
elif [ "${RERUN_DASK}" == "false" ]; then
  echo "Use existing parquets if they exist."
  if [ "${BIN_SIZE}" == "none" ]; then
    readonly host_pretrain_parquet="${host_pretrain}/phase${PHASE}/unbinned/parquet"
    readonly container_pretrain_parquet="${container_pretrain}/phase${PHASE}/unbinned/parquet"
  else
    readonly host_pretrain_parquet="${host_pretrain}/phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
    readonly container_pretrain_parquet="${container_pretrain}/phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
  fi
else
  echo "\${RERUN_DASK} = ${RERUN_DASK} unknown!"
  exit 1
fi
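# For example (derived from the branches above): with RERUN_DASK='false',
# PHASE=1 and BIN_SIZE=64, the shards are expected under
#   ${host_pretrain}/phase1/bin_size_64/parquet
# on the host, which the container sees as
#   ${container_pretrain}/phase1/bin_size_64/parquet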
#
# Determine the pretraining arguments.
#
# Should we exit pretraining early?
if [ "${STEPS_THIS_RUN}" == "none" ]; then
  readonly steps_this_run_flag=""
else
  readonly steps_this_run_flag="--steps_this_run ${STEPS_THIS_RUN}"
fi
#
# Determine the pretraining command.
#
# CPU-GPU binding.
readonly BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"
# Arguments that are specific to Phase1 and Phase2.
readonly PHASE1="\
  --learning_rate=6e-3 \
  --warmup_proportion=0.2843 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --max_steps=7038 \
  --num_steps_per_checkpoint=2500 \
  "
readonly PHASE2="\
  --learning_rate=4e-3 \
  --warmup_proportion=0.128 \
  --phase2 \
  --max_seq_length=512 \
  --max_predictions_per_seq=80 \
  --max_steps=1563 \
  --num_steps_per_checkpoint=1000 \
  --resume_from_checkpoint --phase1_end_step=7038 \
  --init_checkpoint=${container_init_checkpoint} \
  "
# Arguments for fp16.
if [ "${PRECISION}" == "fp16" ]; then
  readonly fp16_flags="--fp16 --allreduce_post_accumulation_fp16"
elif [ "${PRECISION}" == "tf32" ]; then
  readonly fp16_flags=""
else
  echo "\${PRECISION} = ${PRECISION} unknown!"
  exit 1
fi
# Get the actual pretraining command.
readonly PHASES=( "$PHASE1" "$PHASE2" )
readonly BERT_CMD="\
  ${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
  --seed=${SEED} \
  --train_batch_size=${TRAIN_BATCH_SIZE} \
  ${PHASES[$((PHASE - 1))]} \
  --do_train \
  --config_file=/workspace/bert/bert_configs/${CONFIG}.json \
  --input_dir=${container_pretrain_parquet} \
  --vocab_file=/workspace/bert/vocab/vocab \
  --output_dir=${container_output} \
  ${fp16_flags} \
  --allreduce_post_accumulation \
  --gradient_accumulation_steps=${GRADIENT_ACCUMULATION_STEPS} \
  --log_freq=1 \
  --json-summary=${container_output}/summary.json \
  --disable_progress_bar \
  --num_workers=${NUM_WORKERS} \
  ${steps_this_run_flag} \
  --local_rank=\${SLURM_LOCALID} "
echo "nodes: ${SLURM_JOB_NUM_NODES}, TRAIN_BATCH_SIZE: ${TRAIN_BATCH_SIZE}, GRADIENT_ACCUMULATION_STEPS: ${GRADIENT_ACCUMULATION_STEPS}"
#
# Running the LDDL preprocessor and load balancer.
#
# Determine the number of parquet shards in total.
if [ "${NUM_SHARDS_PER_WORKER}" == "none" ]; then
  readonly num_blocks=4096
else
  readonly num_blocks=$(( NUM_SHARDS_PER_WORKER * (NUM_WORKERS > 0 ? NUM_WORKERS : 1) * SLURM_JOB_NUM_NODES * GPUS ))
fi
echo "num_blocks: ${num_blocks}"
# Run the LDDL preprocessor and load balancer only when there is no file where
# the parquets are supposed to be stored.
if [ ! -d "${host_pretrain_parquet}" ] || [ -z "$(ls -A "${host_pretrain_parquet}")" ]; then
  # The sequence length is 128 for Phase1, but 512 for Phase2.
  if [ "${PHASE}" == "1" ]; then
    readonly target_seq_len_flag=""
  elif [ "${PHASE}" == "2" ]; then
    readonly target_seq_len_flag="--target-seq-length 512"
  else
    echo "\${PHASE} = ${PHASE} unknown!"
    exit 1
  fi
  # Should we use sequence binning?
  if [ "${BIN_SIZE}" == "none" ]; then
    readonly bin_size_flag=""
  else
    readonly bin_size_flag="--bin-size ${BIN_SIZE}"
  fi
  # Static masking or dynamic masking?
  if [ "${MASKING}" == "dynamic" ]; then
    readonly masking_flag=""
  elif [ "${MASKING}" == "static" ]; then
    readonly masking_flag="--masking"
  else
    echo "\${MASKING} = ${MASKING} unknown!"
    exit 1
  fi
  # Should we use jemalloc for the LDDL preprocessor?
  if [ "${USE_JEMALLOC}" == "true" ]; then
    readonly use_jemalloc_flag="--export=ALL,LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so"
  elif [ "${USE_JEMALLOC}" == "false" ]; then
    readonly use_jemalloc_flag=""
  else
    echo "\${USE_JEMALLOC} = ${USE_JEMALLOC} unknown!"
    exit 1
  fi
  # Run the LDDL preprocessor.
  srun -l \
    --mpi=pmix \
    --container-image="${docker_image}" \
    --container-mounts="${mounts}" \
    --ntasks-per-node="${DASK_TASKS_PER_NODE}" \
    ${use_jemalloc_flag} \
    preprocess_bert_pretrain \
      --schedule mpi \
      ${target_seq_len_flag} \
      --wikipedia "${container_wikipedia_source}" \
      --sink "${container_pretrain_parquet}" \
      --vocab-file /workspace/bert/vocab/vocab \
      --num-blocks "${num_blocks}" \
      --sample-ratio "${SAMPLE_RATIO}" \
      ${bin_size_flag} \
      ${masking_flag} \
      --seed "${SEED}"
  # Run the LDDL load balancer.
  srun -l \
    --mpi=pmix \
    --container-image="${docker_image}" \
    --container-mounts="${mounts}" \
    --ntasks-per-node="${DASK_TASKS_PER_NODE}" \
    balance_dask_output \
      --indir "${container_pretrain_parquet}" \
      --num-shards "${num_blocks}"
fi
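# Optional sanity check (a small addition to this sketch, not required by the
# workflow): report how many shards ended up on disk, which should normally
# match ${num_blocks} after the load balancer has run.
echo "parquet shards on disk: $(ls -A "${host_pretrain_parquet}" | wc -l)"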
#
# Run pretraining.
#
srun -l \
  --container-image="${docker_image}" \
  --container-mounts="${mounts}" \
  --ntasks-per-node="${GPUS}" \
  bash -c "${BERT_CMD}"
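# For reference, a hedged sketch of chaining the two phases (the script file
# name, node count, and checkpoint path below are placeholders, not part of
# this script): Phase2 either infers its initial checkpoint from the job
# dependency as above, or takes it explicitly via ${INIT_CHECKPOINT}, e.g.
#   phase1_id=$(PHASE=1 sbatch --parsable -N 4 run_pretraining.sub)
#   PHASE=2 INIT_CHECKPOINT="$PWD/results/${phase1_id}/ckpt_7038.pt" \
#     sbatch --dependency="afterok:${phase1_id}" -N 4 run_pretraining.sub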