#!/bin/bash
# SLURM batch directives: take whole nodes, all of their memory, and allow
# more tasks than CPUs (GPU-bound workload).
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fail fast on errors and unset variables, trace commands for job-log
# debugging, and propagate failures through pipelines.
set -euxo pipefail
# The following variables need to be set.
# Base container to be used - container built in step 1 of the quick start guide.
readonly docker_image="nvcr.io/nvidia/pytorch:20.06-py3"
# Location of dataset for phase 1 (seq_len=128 shards).
readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
# Location of dataset for phase 2 (seq_len=512 shards).
readonly datadir_phase2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
# Path to where trained checkpoints will be saved on the system.
readonly checkpointdir="$PWD/checkpoints"
# host_path:container_path mount pairs handed to the container runtime.
readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
# bind.sh wrapper constrains CPU and InfiniBand affinity per rank; the
# trailing "--" separates its options from the wrapped command.
# NOTE(review): semantics of --cpu=exclusive/--ib=single come from bind.sh,
# which is not visible here — confirm against that script.
readonly BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"
# Create the checkpoint directory once per allocated node (it lives on
# node-local storage, so every node needs its own copy) before launching.
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
# Phase 1 hyperparameters: short sequences (128 tokens, 20 masked
# predictions). BATCHSIZE, LR and WARMUP_UPDATES may be overridden from the
# submitting environment; the values below are the defaults.
PHASE1="\
--train_batch_size=${BATCHSIZE:-16} \
--learning_rate=${LR:-6e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.2843} \
--input_dir=/workspace/data \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=7038 \
--num_steps_per_checkpoint=2500 \
"
# Phase 2 hyperparameters: long sequences (512 tokens, 80 masked
# predictions), resuming from the phase-1 checkpoint at step 7038.
# BATCHSIZE, LR and WARMUP_UPDATES may be overridden from the environment.
PHASE2="\
--train_batch_size=${BATCHSIZE:-4096} \
--learning_rate=${LR:-4e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.128} \
--input_dir=/workspace/data_phase2 \
--phase2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=1563 \
--num_steps_per_checkpoint=1000 \
--resume_from_checkpoint --phase1_end_step=7038 \
"
# Pick the argument set for the requested phase. PHASE is 1-based and
# defaults to 1; the array index is therefore PHASE-1.
PHASES=( "$PHASE1" "$PHASE2" )
PHASE=${PHASE:-1}
# Full training command executed by each srun task inside the container.
# Note the escaped \${SLURM_LOCALID}: it must expand inside each task's own
# shell (per-task local rank), not here at submission time.
BERT_CMD="\
${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
--seed=42 \
${PHASES[$((PHASE-1))]} \
--do_train \
--config_file=/workspace/bert/bert_config.json \
--output_dir=/results \
--fp16 \
--allreduce_post_accumulation --allreduce_post_accumulation_fp16 \
--gradient_accumulation_steps=${GRADIENT_STEPS:-2} \
--log_freq=1 \
--local_rank=\${SLURM_LOCALID}"
# Launch the training command in the container on every allocated task,
# labelling each task's output (-l).
# NOTE(review): --container-image/--container-mounts require the pyxis SLURM
# plugin — confirm it is installed on the cluster.
srun -l --container-image="${docker_image}" --container-mounts="${mounts}" sh -c "${BERT_CMD}"