#!/bin/bash
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
#SBATCH --parsable

# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort on the first failing command (-e), on any use of an unset variable
# (-u) and on a failure in any stage of a pipeline (pipefail); -x traces every
# command into the job log.
set -euxo pipefail

#
# Job Configurations
#
# Every setting below can be overridden from the environment of the submitting
# shell; the value after ':-' is the default used when the variable is unset
# or empty.

# Tag to the built image.
IMAGE_VERSION=${IMAGE_VERSION:-"22.12-py3"}
# Number of processes per node used for the LDDL preprocessor.
DASK_TASKS_PER_NODE=${DASK_TASKS_PER_NODE:-128}
# Pretraining phase to run: 1 or 2.
PHASE=${PHASE:-1}
# An integer seed handed to the LDDL preprocessor (--seed). The training
# command built further down in this script passes its own fixed --seed value.
SEED=${SEED:-42}
# The fraction of the articles from the Wikipedia dataset to sample and use
# for pretraining. 0 < ${SAMPLE_RATIO} < 1.0
SAMPLE_RATIO=${SAMPLE_RATIO:-0.9}
# Number of GPUs per node. 0 < ${GPUS} <= 8.
GPUS=${GPUS:-"8"}
# The bin size for binned LDDL data loading. 'none' or an integer that divides
# 128 (for Phase1) or 512 (for Phase2).
BIN_SIZE=${BIN_SIZE:-"none"}
# Number of parquet shards per each LDDL data loader worker process. 'none' or
# an integer.
NUM_SHARDS_PER_WORKER=${NUM_SHARDS_PER_WORKER:-"none"}
# Number of LDDL data loader worker processes per rank.
NUM_WORKERS=${NUM_WORKERS:-4}
# Should we rerun the LDDL preprocessor every time? 'true' or 'false'.
RERUN_DASK=${RERUN_DASK:-"true"}
# Masking strategy: 'static' or 'dynamic'.
MASKING=${MASKING:-"static"}
# Should we use jemalloc for the LDDL preprocessor? 'true' or 'false'.
USE_JEMALLOC=${USE_JEMALLOC:-"true"}
# Training precision: 'fp16' or 'tf32'.
PRECISION=${PRECISION:-"fp16"}
# The path to the initial checkpoint (from Phase1) used to start Phase2. 'none'
# or an absolute path.
INIT_CHECKPOINT=${INIT_CHECKPOINT:-"none"}
# The per-rank batch size before being divided by the gradient accumulation
# steps.
TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-"256"}
# The gradient accumulation steps.
GRADIENT_ACCUMULATION_STEPS=${GRADIENT_ACCUMULATION_STEPS:-"32"}
#
# Static Configurations
#

# Container URL.
# Replace this with the URL of the docker image that you build
# with scripts/docker/build.sh .
readonly docker_image="bert:${IMAGE_VERSION}"

# Where the datasets are stored on the system.
readonly host_datadir="/home/${USER}/datasets"
readonly container_datadir="/datasets"

# Replace these with the path to the 'source' subdirectory of the LDDL Wikipedia
# dataset.
readonly host_wikipedia_source="${host_datadir}/wikipedia/source"
readonly container_wikipedia_source="${container_datadir}/wikipedia/source"
readonly wikipedia_mount="${host_wikipedia_source}:${container_wikipedia_source}"

# Replace these with where you want to store the Parquet shards in case
# ${RERUN_DASK} is 'false'.
readonly host_pretrain="${host_datadir}/pretrain"
readonly container_pretrain="${container_datadir}/pretrain"
readonly pretrain_mount="${host_pretrain}:${container_pretrain}"

# Replace these with where you want to store the pretrained checkpoints on
# the system. Each job writes into its own directory named after its job id.
readonly host_output="${PWD}/results/${SLURM_JOB_ID}"
mkdir -p -- "${host_output}"
readonly container_output="/results"
readonly output_mount="${host_output}:${container_output}"

# If INIT_CHECKPOINT is 'none', infer INIT_CHECKPOINT based on job dependency.
# NOTE(review): ${SLURM_JOB_DEPENDENCY} is used verbatim as the name of a
# results sub-directory, so this presumably expects it to hold a bare job id;
# confirm against how the Phase2 job is submitted.
if [[ "${INIT_CHECKPOINT}" == "none" && "${PHASE}" == "2" ]]; then
  INIT_CHECKPOINT="${PWD}/results/${SLURM_JOB_DEPENDENCY:?must be set to infer INIT_CHECKPOINT for Phase2}/bert-large-uncased/phase1/7038"
fi

# Define mounts.
mounts="${PWD}:/workspace/bert,${wikipedia_mount},${pretrain_mount},${output_mount}"

# Add the mount path of the initial checkpoint for Phase2.
case "${PHASE}" in
  1)
    echo "No init. mounted for Phase1!"
    readonly container_init_checkpoint=""
    ;;
  2)
    # NOTE(review): the test requires INIT_CHECKPOINT to be a regular file,
    # while the directory that contains it is what gets mounted; verify this
    # matches the layout the training script writes.
    if [[ ! -f "${INIT_CHECKPOINT}" ]]; then
      echo "No init. checkpoint found for Phase2!" >&2
      exit 1
    fi
    mounts="${mounts},$(dirname "${INIT_CHECKPOINT}"):/checkpoints"
    readonly container_init_checkpoint="/checkpoints"
    ;;
  *)
    echo "\${PHASE} = ${PHASE} unknown!" >&2
    exit 1
    ;;
esac

# Determine where the parquet shards should be stored.
case "${RERUN_DASK}" in
  true)
    # Always rerun the dask pipeline. Therefore, use the (per-job, initially
    # empty) output directory to store the parquets.
    readonly host_pretrain_parquet="${host_output}/parquet"
    readonly container_pretrain_parquet="${container_output}/parquet"
    ;;
  false)
    echo "Use existing parquets if they exists."
    # Shards are kept per phase and per binning configuration so that
    # differently preprocessed datasets do not overwrite each other.
    if [[ "${BIN_SIZE}" == "none" ]]; then
      readonly parquet_subdir="phase${PHASE}/unbinned/parquet"
    else
      readonly parquet_subdir="phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
    fi
    readonly host_pretrain_parquet="${host_pretrain}/${parquet_subdir}"
    readonly container_pretrain_parquet="${container_pretrain}/${parquet_subdir}"
    ;;
  *)
    echo "\${RERUN_DASK} = ${RERUN_DASK} unknown!" >&2
    exit 1
    ;;
esac
# Command-line arguments of run_pretraining.py that are specific to Phase1.
# The backslash-newline pairs are line continuations inside the string, so the
# value is a single line of space-separated flags.
readonly PHASE1="\
    --learning-rate=6e-3 \
    --warmup-proportion=0.2843 \
    --phase1 \
    --max-seq-length=128 \
    --max-predictions-per-seq=20 \
    --max-steps=7038 \
    --num-steps-per-checkpoint=2500 \
    "

# Command-line arguments of run_pretraining.py that are specific to Phase2.
# Phase2 starts from the checkpoint mounted at ${container_init_checkpoint}.
readonly PHASE2="\
    --learning-rate=4e-3 \
    --warmup-proportion=0.128 \
    --phase2 \
    --max-seq-length=512 \
    --max-predictions-per-seq=80 \
    --max-steps=1563 \
    --num-steps-per-checkpoint=1000 \
    --from-pretrained-params=${container_init_checkpoint} \
    "

# Arguments for fp16. tf32 needs no extra flags.
case "${PRECISION}" in
  fp16)
    readonly fp16_flags="--amp --use-dynamic-loss-scaling --scale-loss=1048576"
    ;;
  tf32)
    readonly fp16_flags=""
    ;;
  *)
    echo "\${PRECISION} = ${PRECISION} unknown!" >&2
    exit 1
    ;;
esac
#######################################
# Join whitespace-separated words into one comma-separated list.
# Arguments: $1 - text whose words are separated by spaces, tabs or newlines
# Outputs:   the words joined with ',' on stdout (no trailing newline)
#######################################
csv_from_words() {
  local -a words=()
  # read reports failure when the input ends without a NUL delimiter even
  # though the array has been filled, hence the '|| true'.
  read -r -d '' -a words <<<"$1" || true
  local IFS=,
  printf '%s' "${words[*]-}"
}

#######################################
# Print the comma-separated device ids 0..N-1.
# Arguments: $1 - number of GPUs per node (a positive integer)
# Outputs:   the id list on stdout (no trailing newline)
# Returns:   1 (with a message on stderr) if $1 is not a positive integer
#######################################
gpu_id_list() {
  local -r count=$1
  if [[ ! "${count}" =~ ^[1-9][0-9]*$ ]]; then
    echo "\${GPUS} = ${count} is not a positive integer!" >&2
    return 1
  fi
  local ids="" i
  for ((i = 0; i < count; i++)); do
    ids+="${ids:+,}${i}"
  done
  printf '%s' "${ids}"
}

# Get the ip address of all nodes: one task per node prints its address(es),
# and the collected output is turned into a comma-separated list.
# '--mpi=pmix' matches the other job steps of this script; the former '-pmix'
# spelling is parsed by srun as '-p mix', i.e. a partition named 'mix'.
IP_CMD="hostname -i"
IP_STR=$(srun --mpi=pmix --ntasks-per-node=1 bash -c "${IP_CMD}")
IP_STR=$(csv_from_words "${IP_STR}")
echo "\${IP_STR} = ${IP_STR}"

# Device ids handed to the launcher, derived from ${GPUS} so that training uses
# the same GPU count as the shard computation (GPUS=8 yields 0,1,2,3,4,5,6,7).
# A failing substitution aborts the script through 'set -e'.
gpu_ids=$(gpu_id_list "${GPUS}")
readonly gpu_ids

# Get the actual pretraining command. ${PHASES} is indexed by PHASE - 1, so
# PHASE=1 selects ${PHASE1} and PHASE=2 selects ${PHASE2}.
# Note that --seed is a fixed value here and does not follow ${SEED}.
readonly PHASES=( "${PHASE1}" "${PHASE2}" )
readonly BERT_CMD="\
    python -m paddle.distributed.launch \
    --gpus=${gpu_ids} \
    --ips=${IP_STR} \
    /workspace/bert/run_pretraining.py \
    ${PHASES[$((PHASE - 1))]} \
    --batch-size=${TRAIN_BATCH_SIZE} \
    --input-dir=${container_pretrain_parquet} \
    --output-dir=${container_output} \
    --vocab-file=/workspace/bert/vocab/bert-large-uncased-vocab.txt \
    --bert-model=bert-large-uncased \
    --config-file=/workspace/bert/bert_configs/bert-large-uncased.json \
    --gradient-merge-steps=${GRADIENT_ACCUMULATION_STEPS} \
    --log-freq=1 \
    --seed=12345 \
    --optimizer=Lamb \
    ${fp16_flags} "

echo "nodes: ${SLURM_JOB_NUM_NODES}, TRAIN_BATCH_SIZE: ${TRAIN_BATCH_SIZE}, GRADIENT_ACCUMULATION_STEPS: ${GRADIENT_ACCUMULATION_STEPS}"
#
# Running the LDDL preprocessor and load balancer.
#

# Determine the number of parquet shards in total.
if [[ "${NUM_SHARDS_PER_WORKER}" == "none" ]]; then
  readonly num_blocks=4096
else
  # shards per worker x workers per rank x nodes x GPUs per node, where a
  # NUM_WORKERS that is not positive is counted as 1.
  readonly num_blocks=$((NUM_SHARDS_PER_WORKER * (NUM_WORKERS > 0 ? NUM_WORKERS : 1) * SLURM_JOB_NUM_NODES * GPUS))
fi
echo "num_blocks: ${num_blocks}"

# Run the LDDL preprocessor and load balancer only when there is no file in
# where the parquets are supposed to be stored.
if [[ ! -d "${host_pretrain_parquet}" ]] \
  || [[ -z "$(ls -A "${host_pretrain_parquet}")" ]]; then

  # Arguments of preprocess_bert_pretrain, collected in an array so that every
  # value stays a single word and optional flags are simply left out.
  preprocess_args=(--schedule mpi)

  # The sequence length is 128 for Phase1, but 512 for Phase2.
  case "${PHASE}" in
    1) ;;
    2) preprocess_args+=(--target-seq-length 512) ;;
    *)
      echo "\${PHASE} = ${PHASE} unknown!" >&2
      exit 1
      ;;
  esac

  preprocess_args+=(
    --wikipedia "${container_wikipedia_source}"
    --sink "${container_pretrain_parquet}"
    --vocab-file /workspace/bert/vocab/bert-large-uncased-vocab.txt
    --num-blocks "${num_blocks}"
    --sample-ratio "${SAMPLE_RATIO}"
  )

  # Should we use sequence binning?
  if [[ "${BIN_SIZE}" != "none" ]]; then
    preprocess_args+=(--bin-size "${BIN_SIZE}")
  fi

  # Static masking or dynamic masking? Only static masking adds a flag.
  case "${MASKING}" in
    dynamic) ;;
    static) preprocess_args+=(--masking) ;;
    *)
      echo "\${MASKING} = ${MASKING} unknown!" >&2
      exit 1
      ;;
  esac

  preprocess_args+=(--seed "${SEED}")

  # srun options shared by the preprocessor and the load balancer.
  lddl_srun_args=(
    -l
    --mpi=pmix
    --container-image="${docker_image}"
    --container-mounts="${mounts}"
    --ntasks-per-node="${DASK_TASKS_PER_NODE}"
  )

  # Should we use jemalloc for the LDDL preprocessor? If so, it is preloaded
  # into the preprocessor tasks through the exported environment.
  preprocess_srun_args=("${lddl_srun_args[@]}")
  case "${USE_JEMALLOC}" in
    true)
      preprocess_srun_args+=(
        --export=ALL,LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
      )
      ;;
    false) ;;
    *)
      echo "\${USE_JEMALLOC} = ${USE_JEMALLOC} unknown!" >&2
      exit 1
      ;;
  esac

  # Run the LDDL preprocessor.
  srun "${preprocess_srun_args[@]}" \
    preprocess_bert_pretrain "${preprocess_args[@]}"

  # Run the LDDL load balancer.
  srun "${lddl_srun_args[@]}" \
    balance_dask_output \
    --indir "${container_pretrain_parquet}" \
    --num-shards "${num_blocks}"
fi
#
# Run pretraining.
#
# One task per node runs the launcher command held in ${BERT_CMD} inside the
# container. '--mpi=pmix' matches the other job steps of this script; the
# former '-pmix' spelling is parsed by srun as '-p mix', i.e. a partition
# named 'mix'.
srun -l \
  --mpi=pmix \
  --container-image="${docker_image}" \
  --container-mounts="${mounts}" \
  --ntasks-per-node=1 \
  bash -c "${BERT_CMD}"