#!/bin/bash
# SLURM batch script: multi-node BERT pretraining inside an NVIDIA PyTorch
# container. Directives below request exclusive node access, all of each
# node's memory (--mem=0), and allow task overcommit.
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -e: abort on error; -u: error on unset vars; -x: trace commands for the log.
set -eux
# The following variables need to be set
# Base container to be used - container built in step 1 on quick start guide
readonly docker_image="nvcr.io/nvidia/pytorch:20.06-py3"
# Location of dataset for phase 1
readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
# Location of dataset for phase 2
readonly datadir_phase2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
# Path to where trained checkpoints will be saved on the system
readonly checkpointdir="$PWD/checkpoints"
# host:container bind-mount pairs passed to srun's container runtime.
readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
# Launcher prefix that pins CPU/NIC affinity before exec'ing the training
# command. NOTE(review): bind.sh is repo-local and not visible here — assumed
# to forward everything after "--" unchanged.
BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"
# Create the checkpoint directory once per node (one task per node), so the
# /results mount target exists everywhere before the container starts.
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
# Phase 1 hyperparameters: seq length 128, 7038 training steps. Each option
# string is whitespace-joined and later spliced verbatim into BERT_CMD, so
# the trailing backslash-newlines and spaces are load-bearing.
PHASE1="\
--train_batch_size=${BATCHSIZE:-16} \
--learning_rate=${LR:-6e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.2843} \
--input_dir=/workspace/data \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=7038 \
--num_steps_per_checkpoint=2500 \
"
# Phase 2 hyperparameters: seq length 512, resumes from the phase-1
# checkpoint at step 7038 (--phase1_end_step must match PHASE1 --max_steps).
PHASE2="\
--train_batch_size=${BATCHSIZE:-4096} \
--learning_rate=${LR:-4e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.128} \
--input_dir=/workspace/data_phase2 \
--phase2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=1563 \
--num_steps_per_checkpoint=1000 \
--resume_from_checkpoint --phase1_end_step=7038 \
"
# PHASE env var (default 1) selects the option string; array is 0-indexed,
# hence the PHASE-1 arithmetic below.
PHASES=( "$PHASE1" "$PHASE2" )
PHASE=${PHASE:-1}
# Full training command, built as a single string so it can be handed to
# `sh -c` inside the container. SLURM_LOCALID is escaped (\$) on purpose:
# it must expand inside each srun task, where it holds that task's
# node-local rank, not here in the submission shell.
BERT_CMD="\
${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
--seed=42 \
${PHASES[$((PHASE-1))]} \
--do_train \
--config_file=/workspace/bert/bert_config.json \
--output_dir=/results \
--fp16 \
--allreduce_post_accumulation --allreduce_post_accumulation_fp16 \
--gradient_accumulation_steps=${GRADIENT_STEPS:-2} \
--log_freq=1 \
--local_rank=\${SLURM_LOCALID}"
# Launch one containerized training process per task across the allocation;
# -l prefixes each output line with its task id.
srun -l --container-image="${docker_image}" --container-mounts="${mounts}" sh -c "${BERT_CMD}"