# run.sub — SLURM batch script: LDDL Wikipedia preprocessing + BERT pretraining.
  1. #!/bin/bash
  2. #SBATCH --exclusive
  3. #SBATCH --mem=0
  4. #SBATCH --overcommit
  5. #SBATCH --parsable
  6. # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
  7. # Licensed under the Apache License, Version 2.0 (the "License");
  8. # you may not use this file except in compliance with the License.
  9. # You may obtain a copy of the License at
  10. #
  11. # http://www.apache.org/licenses/LICENSE-2.0
  12. #
  13. # Unless required by applicable law or agreed to in writing, software
  14. # distributed under the License is distributed on an "AS IS" BASIS,
  15. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. # See the License for the specific language governing permissions and
  17. # limitations under the License.
  18. set -eux
  19. #
  20. # Job Configurations
  21. #
  22. # Tag to the built image.
  23. IMAGE_VERSION=${IMAGE_VERSION:-"21.11-py3"}
  24. # Number of processes per node used for the LDDL preprocessor.
  25. DASK_TASKS_PER_NODE=${DASK_TASKS_PER_NODE:-128}
  26. # 1 or 2 .
  27. PHASE=${PHASE:-1}
  28. # An integer that specifies the pretraining seed.
  29. SEED=${SEED:-42}
  30. # The percentage of the articles from the Wikipedia dataset to sample and used
  31. # for pretraining. 0 < ${SAMPLE_RATIO} < 1.0
  32. SAMPLE_RATIO=${SAMPLE_RATIO:-0.9}
  33. # How many global steps to run before ending the pretraining job. This argument
  34. # does not impact the learning rate schedule, but only if the pretraining job
  35. # should exit early. 'none' or an integer.
  36. STEPS_THIS_RUN=${STEPS_THIS_RUN:-"none"}
  37. # Number of GPUs per node. 0 < ${GPUS} <= 8.
  38. GPUS=${GPUS:-"8"}
  39. # The bin size for binned LDDL data loading. 'none' or an integer that divides
  40. # 128 (for Phase1) or 512 (for Phase2).
  41. BIN_SIZE=${BIN_SIZE:-"64"}
  42. # Number of parquet shards per each LDDL data loader worker process. 'none' or
  43. # an integer.
  44. NUM_SHARDS_PER_WORKER=${NUM_SHARDS_PER_WORKER:-"none"}
  45. # Number of LDDL data loader worker processes per rank.
  46. NUM_WORKERS=${NUM_WORKERS:-4}
  47. # Should we rerun the LDDL preprocessor every time? 'true' or 'false' .
  48. RERUN_DASK=${RERUN_DASK:-"true"}
  49. # 'static' or 'dynamic' .
  50. MASKING=${MASKING:-"static"}
  51. # Should we use jemalloc for the LDDL preprocessor? 'true' or 'false' .
  52. USE_JEMALLOC=${USE_JEMALLOC:-"true"}
  53. # 'fp16' or 'tf32' .
  54. PRECISION=${PRECISION:-"fp16"}
  55. # 'base' or 'large' .
  56. CONFIG=${CONFIG:-"large"}
  57. # The path to the initial checkpoint (from Phase1) used to start Phase2. 'none'
  58. # or an absolute path.
  59. INIT_CHECKPOINT=${INIT_CHECKPOINT:-"none"}
  60. # The per-rank batch size before being divided by the gradient accumulation
  61. # steps.
  62. TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-"8192"}
  63. # The gradient accumulation steps.
  64. GRADIENT_ACCUMULATION_STEPS=${GRADIENT_ACCUMULATION_STEPS:-"32"}
  65. #
  66. # Static Configurations
  67. #
  68. # Container URL.
  69. # Replace this with the URL of the docker image that you build
  70. # with scripts/docker/build.sh .
  71. readonly docker_image="bert:${IMAGE_VERSION}"
  72. # Where the datasets are stored on the system.
  73. readonly host_datadir="/home/${USER}/datasets"
  74. readonly container_datadir="/datasets"
  75. # Replace these with the path to the 'source' subdirectory of the LDDL Wikipedia
  76. # dataset.
  77. readonly host_wikipedia_source="${host_datadir}/wikipedia/source"
  78. readonly container_wikipedia_source="${container_datadir}/wikipedia/source"
  79. readonly wikipedia_mount="${host_wikipedia_source}:${container_wikipedia_source}"
  80. # Replace these with where you want to store the Parquet shards in case
  81. # ${RERUN_DASK} is 'false'.
  82. readonly host_pretrain="${host_datadir}/pretrain"
  83. readonly container_pretrain="${container_datadir}/pretrain"
  84. readonly pretrain_mount="${host_pretrain}:${container_pretrain}"
  85. # Replace these with where you want to store the pretrained checkpoints on
  86. # the system.
  87. readonly host_output="$PWD/results/${SLURM_JOB_ID}"
  88. mkdir -p "${host_output}"
  89. readonly container_output="/results"
  90. readonly output_mount="${host_output}:${container_output}"
  91. # If INIT_CHECKPOINT is 'none', infer INIT_CHECKPOINT based on job dependency.
  92. if [ "${INIT_CHECKPOINT}" == "none" ] && [ "${PHASE}" == "2" ] ; then
  93. INIT_CHECKPOINT="$PWD/results/${SLURM_JOB_DEPENDENCY}/ckpt_7038.pt"
  94. fi
  95. # Define mounts.
  96. mounts="${wikipedia_mount},${pretrain_mount},${output_mount}"
  97. # Add the mount path of the initial checkpoint for Phase2.
  98. if [ "${PHASE}" == "1" ]; then
  99. echo "No init. mounted for Phase1!"
  100. readonly container_init_checkpoint=""
  101. elif [ "${PHASE}" == "2" ]; then
  102. if [ ! -f "${INIT_CHECKPOINT}" ]; then
  103. echo "No init. checkpoint found for Phase2!"
  104. exit 1
  105. else
  106. mounts="${mounts},$(dirname "${INIT_CHECKPOINT}"):/checkpoints"
  107. readonly container_init_checkpoint="/checkpoints/$(basename "${INIT_CHECKPOINT}")"
  108. fi
  109. else
  110. echo "\${PHASE} = ${PHASE} unknown!"
  111. exit 1
  112. fi
  113. # Determine where the parquet shards should be stored.
  114. if [ "${RERUN_DASK}" == "true" ]; then
  115. # Always rerun the dask pipeline. Therefore, use the output directory to store
  116. # the parquets.
  117. readonly host_pretrain_parquet="${host_output}/parquet"
  118. readonly container_pretrain_parquet="${container_output}/parquet"
  119. elif [ "${RERUN_DASK}" == "false" ]; then
  120. echo "Use existing parquets if they exists."
  121. if [ "${BIN_SIZE}" == "none" ]; then
  122. readonly host_pretrain_parquet="${host_pretrain}/phase${PHASE}/unbinned/parquet"
  123. readonly container_pretrain_parquet="${container_pretrain}/phase${PHASE}/unbinned/parquet"
  124. else
  125. readonly host_pretrain_parquet="${host_pretrain}/phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
  126. readonly container_pretrain_parquet="${container_pretrain}/phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
  127. fi
  128. else
  129. echo "\${RERUN_DASK} = ${RERUN_DASK} unknown!"
  130. exit 1
  131. fi
  132. #
  133. # Determine the pretraining arguments.
  134. #
  135. # Should we exit pretraining early?
  136. if [ "${STEPS_THIS_RUN}" == "none" ]; then
  137. readonly steps_this_run_flag=""
  138. else
  139. readonly steps_this_run_flag="--steps_this_run ${STEPS_THIS_RUN}"
  140. fi
  141. #
  142. # Determine the pretraining command.
  143. #
  144. # CPU-GPU binding.
  145. readonly BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"
  146. # Arguments that are specific to Phase1 and Phase2.
  147. readonly PHASE1="\
  148. --learning_rate=6e-3 \
  149. --warmup_proportion=0.2843 \
  150. --max_seq_length=128 \
  151. --max_predictions_per_seq=20 \
  152. --max_steps=7038 \
  153. --num_steps_per_checkpoint=2500 \
  154. "
  155. readonly PHASE2="\
  156. --learning_rate=4e-3 \
  157. --warmup_proportion=0.128 \
  158. --phase2 \
  159. --max_seq_length=512 \
  160. --max_predictions_per_seq=80 \
  161. --max_steps=1563 \
  162. --num_steps_per_checkpoint=1000 \
  163. --resume_from_checkpoint --phase1_end_step=7038 \
  164. --init_checkpoint=${container_init_checkpoint} \
  165. "
  166. # Arguments for fp16.
  167. if [ "${PRECISION}" == "fp16" ]; then
  168. readonly fp16_flags="--fp16 --allreduce_post_accumulation_fp16"
  169. elif [ "${PRECISION}" == "tf32" ]; then
  170. readonly fp16_flags=""
  171. else
  172. echo "\${PRECISION} = ${PRECISION} unknown!"
  173. exit 1
  174. fi
  175. # Get the actual pretraining command.
  176. readonly PHASES=( "$PHASE1" "$PHASE2" )
  177. readonly BERT_CMD="\
  178. ${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
  179. --seed=${SEED} \
  180. --train_batch_size=${TRAIN_BATCH_SIZE} \
  181. ${PHASES[$((PHASE - 1))]} \
  182. --do_train \
  183. --config_file=/workspace/bert/bert_configs/${CONFIG}.json \
  184. --input_dir=${container_pretrain_parquet} \
  185. --vocab_file=/workspace/bert/vocab/vocab \
  186. --output_dir=${container_output} \
  187. ${fp16_flags} \
  188. --allreduce_post_accumulation \
  189. --gradient_accumulation_steps=${GRADIENT_ACCUMULATION_STEPS} \
  190. --log_freq=1 \
  191. --json-summary=${container_output}/summary.json \
  192. --disable_progress_bar \
  193. --num_workers=${NUM_WORKERS} \
  194. ${steps_this_run_flag} \
  195. --local_rank=\${SLURM_LOCALID} "
  196. echo "nodes: ${SLURM_JOB_NUM_NODES}, TRAIN_BATCH_SIZE: ${TRAIN_BATCH_SIZE}, GRADIENT_ACCUMULATION_STEPS: ${GRADIENT_ACCUMULATION_STEPS}"
  197. #
  198. # Running the LDDL preprocessor and load balancer.
  199. #
  200. # Determine the number of parquet shards in total.
  201. if [ "${NUM_SHARDS_PER_WORKER}" == "none" ]; then
  202. readonly num_blocks=4096
  203. else
  204. readonly num_blocks=$((NUM_SHARDS_PER_WORKER * $(( NUM_WORKERS > 0 ? NUM_WORKERS : 1 )) * SLURM_JOB_NUM_NODES * GPUS))
  205. fi
  206. echo "num_blocks: ${num_blocks}"
  207. # Run the LDDL preprocessor and load balancer only when there is no file in
  208. # where the parquets are supposed to be stored.
  209. if [ ! -d "${host_pretrain_parquet}" ] || [ -z "$(ls -A "${host_pretrain_parquet}")" ]; then
  210. # The sequence length is 128 for Phase1, but 512 for Phase2.
  211. if [ "${PHASE}" == "1" ]; then
  212. readonly target_seq_len_flag=""
  213. elif [ "${PHASE}" == "2" ]; then
  214. readonly target_seq_len_flag="--target-seq-length 512"
  215. else
  216. echo "\${PHASE} = ${PHASE} unknown!"
  217. exit 1
  218. fi
  219. # Should we use sequence binning?
  220. if [ "${BIN_SIZE}" == "none" ]; then
  221. readonly bin_size_flag=""
  222. else
  223. readonly bin_size_flag="--bin-size ${BIN_SIZE}"
  224. fi
  225. # Static masking or dynamic masking?
  226. if [ "${MASKING}" == "dynamic" ]; then
  227. readonly masking_flag=""
  228. elif [ "${MASKING}" == "static" ]; then
  229. readonly masking_flag="--masking"
  230. else
  231. echo "\${MASKING} = ${MASKING} unknown!"
  232. exit 1
  233. fi
  234. # Should we use jemalloc for the LDDL preprocessor?
  235. if [ "${USE_JEMALLOC}" == "true" ]; then
  236. readonly use_jemalloc_flag="--export=ALL,LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so"
  237. elif [ "${USE_JEMALLOC}" == "false" ]; then
  238. readonly use_jemalloc_flag=""
  239. else
  240. echo "\${USE_JEMALLOC} = ${USE_JEMALLOC} unknown!"
  241. exit 1
  242. fi
  243. # Run the LDDL preprocessor.
  244. srun -l \
  245. --mpi=pmix \
  246. --container-image="${docker_image}" \
  247. --container-mounts="${mounts}" \
  248. --ntasks-per-node="${DASK_TASKS_PER_NODE}" \
  249. ${use_jemalloc_flag} \
  250. preprocess_bert_pretrain \
  251. --schedule mpi \
  252. ${target_seq_len_flag} \
  253. --wikipedia ${container_wikipedia_source} \
  254. --sink "${container_pretrain_parquet}" \
  255. --vocab-file /workspace/bert/vocab/vocab \
  256. --num-blocks "${num_blocks}" \
  257. --sample-ratio "${SAMPLE_RATIO}" \
  258. ${bin_size_flag} \
  259. ${masking_flag} \
  260. --seed "${SEED}"
  261. # Run the LDDL load balancer.
  262. srun -l \
  263. --mpi=pmix \
  264. --container-image="${docker_image}" \
  265. --container-mounts="${mounts}" \
  266. --ntasks-per-node="${DASK_TASKS_PER_NODE}" \
  267. balance_dask_output \
  268. --indir "${container_pretrain_parquet}" \
  269. --num-shards "${num_blocks}"
  270. fi
  271. #
  272. # Run pretraining.
  273. #
  274. srun -l --container-image="${docker_image}" --container-mounts="${mounts}" --ntasks-per-node="${GPUS}" bash -c "${BERT_CMD}"