  1. #!/bin/bash
  2. #SBATCH --exclusive
  3. #SBATCH --mem=0
  4. #SBATCH --overcommit
  5. #SBATCH --parsable
  6. # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
  7. # Licensed under the Apache License, Version 2.0 (the "License");
  8. # you may not use this file except in compliance with the License.
  9. # You may obtain a copy of the License at
  10. #
  11. # http://www.apache.org/licenses/LICENSE-2.0
  12. #
  13. # Unless required by applicable law or agreed to in writing, software
  14. # distributed under the License is distributed on an "AS IS" BASIS,
  15. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. # See the License for the specific language governing permissions and
  17. # limitations under the License.
  18. set -eux
  19. #
  20. # Job Configurations
  21. #
  22. # Tag to the built image.
  23. IMAGE_VERSION=${IMAGE_VERSION:-"22.12-py3"}
  24. # Number of processes per node used for the LDDL preprocessor.
  25. DASK_TASKS_PER_NODE=${DASK_TASKS_PER_NODE:-128}
  26. # 1 or 2 .
  27. PHASE=${PHASE:-1}
  28. # An integer that specifies the pretraining seed.
  29. SEED=${SEED:-42}
  30. # The percentage of the articles from the Wikipedia dataset to sample and used
  31. # for pretraining. 0 < ${SAMPLE_RATIO} < 1.0
  32. SAMPLE_RATIO=${SAMPLE_RATIO:-0.9}
  33. # Number of GPUs per node. 0 < ${GPUS} <= 8.
  34. GPUS=${GPUS:-"8"}
  35. # The bin size for binned LDDL data loading. 'none' or an integer that divides
  36. # 128 (for Phase1) or 512 (for Phase2).
  37. BIN_SIZE=${BIN_SIZE:-"none"}
  38. # Number of parquet shards per each LDDL data loader worker process. 'none' or
  39. # an integer.
  40. NUM_SHARDS_PER_WORKER=${NUM_SHARDS_PER_WORKER:-"none"}
  41. # Number of LDDL data loader worker processes per rank.
  42. NUM_WORKERS=${NUM_WORKERS:-4}
  43. # Should we rerun the LDDL preprocessor every time? 'true' or 'false' .
  44. RERUN_DASK=${RERUN_DASK:-"true"}
  45. # 'static' or 'dynamic' .
  46. MASKING=${MASKING:-"static"}
  47. # Should we use jemalloc for the LDDL preprocessor? 'true' or 'false' .
  48. USE_JEMALLOC=${USE_JEMALLOC:-"true"}
  49. # 'fp16' or 'tf32' .
  50. PRECISION=${PRECISION:-"fp16"}
  51. # The path to the initial checkpoint (from Phase1) used to start Phase2. 'none'
  52. # or an absolute path.
  53. INIT_CHECKPOINT=${INIT_CHECKPOINT:-"none"}
  54. # The per-rank batch size before being divided by the gradient accumulation
  55. # steps.
  56. TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-"256"}
  57. # The gradient accumulation steps.
  58. GRADIENT_ACCUMULATION_STEPS=${GRADIENT_ACCUMULATION_STEPS:-"32"}
  59. #
  60. # Static Configurations
  61. #
  62. # Container URL.
  63. # Replace this with the URL of the docker image that you build
  64. # with scripts/docker/build.sh .
  65. readonly docker_image="bert:${IMAGE_VERSION}"
  66. # Where the datasets are stored on the system.
  67. readonly host_datadir="/home/${USER}/datasets"
  68. readonly container_datadir="/datasets"
  69. # Replace these with the path to the 'source' subdirectory of the LDDL Wikipedia
  70. # dataset.
  71. readonly host_wikipedia_source="${host_datadir}/wikipedia/source"
  72. readonly container_wikipedia_source="${container_datadir}/wikipedia/source"
  73. readonly wikipedia_mount="${host_wikipedia_source}:${container_wikipedia_source}"
  74. # Replace these with where you want to store the Parquet shards in case
  75. # ${RERUN_DASK} is 'false'.
  76. readonly host_pretrain="${host_datadir}/pretrain"
  77. readonly container_pretrain="${container_datadir}/pretrain"
  78. readonly pretrain_mount="${host_pretrain}:${container_pretrain}"
  79. # Replace these with where you want to store the pretrained checkpoints on
  80. # the system.
  81. readonly host_output="$PWD/results/${SLURM_JOB_ID}"
  82. mkdir -p "${host_output}"
  83. readonly container_output="/results"
  84. readonly output_mount="${host_output}:${container_output}"
  85. # If INIT_CHECKPOINT is 'none', infer INIT_CHECKPOINT based on job dependency.
  86. if [ "${INIT_CHECKPOINT}" == "none" ] && [ "${PHASE}" == "2" ] ; then
  87. INIT_CHECKPOINT="$PWD/results/${SLURM_JOB_DEPENDENCY}/bert-large-uncased/phase1/7038"
  88. fi
  89. # Define mounts.
  90. mounts="${PWD}:/workspace/bert,${wikipedia_mount},${pretrain_mount},${output_mount}"
  91. # Add the mount path of the initial checkpoint for Phase2.
  92. if [ "${PHASE}" == "1" ]; then
  93. echo "No init. mounted for Phase1!"
  94. readonly container_init_checkpoint=""
  95. elif [ "${PHASE}" == "2" ]; then
  96. if [ ! -f "${INIT_CHECKPOINT}" ]; then
  97. echo "No init. checkpoint found for Phase2!"
  98. exit 1
  99. else
  100. mounts="${mounts},$(dirname "${INIT_CHECKPOINT}"):/checkpoints"
  101. readonly container_init_checkpoint="/checkpoints"
  102. fi
  103. else
  104. echo "\${PHASE} = ${PHASE} unknown!"
  105. exit 1
  106. fi
  107. # Determine where the parquet shards should be stored.
  108. if [ "${RERUN_DASK}" == "true" ]; then
  109. # Always rerun the dask pipeline. Therefore, use the output directory to store
  110. # the parquets.
  111. readonly host_pretrain_parquet="${host_output}/parquet"
  112. readonly container_pretrain_parquet="${container_output}/parquet"
  113. elif [ "${RERUN_DASK}" == "false" ]; then
  114. echo "Use existing parquets if they exists."
  115. if [ "${BIN_SIZE}" == "none" ]; then
  116. readonly host_pretrain_parquet="${host_pretrain}/phase${PHASE}/unbinned/parquet"
  117. readonly container_pretrain_parquet="${container_pretrain}/phase${PHASE}/unbinned/parquet"
  118. else
  119. readonly host_pretrain_parquet="${host_pretrain}/phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
  120. readonly container_pretrain_parquet="${container_pretrain}/phase${PHASE}/bin_size_${BIN_SIZE}/parquet"
  121. fi
  122. else
  123. echo "\${RERUN_DASK} = ${RERUN_DASK} unknown!"
  124. exit 1
  125. fi
  126. readonly PHASE1="\
  127. --learning-rate=6e-3 \
  128. --warmup-proportion=0.2843 \
  129. --phase1 \
  130. --max-seq-length=128 \
  131. --max-predictions-per-seq=20 \
  132. --max-steps=7038 \
  133. --num-steps-per-checkpoint=2500 \
  134. "
  135. readonly PHASE2="\
  136. --learning-rate=4e-3 \
  137. --warmup-proportion=0.128 \
  138. --phase2 \
  139. --max-seq-length=512 \
  140. --max-predictions-per-seq=80 \
  141. --max-steps=1563 \
  142. --num-steps-per-checkpoint=1000 \
  143. --from-pretrained-params=${container_init_checkpoint} \
  144. "
  145. # Arguments for fp16.
  146. if [ "${PRECISION}" == "fp16" ]; then
  147. readonly fp16_flags="--amp --use-dynamic-loss-scaling --scale-loss=1048576"
  148. elif [ "${PRECISION}" == "tf32" ]; then
  149. readonly fp16_flags=""
  150. else
  151. echo "\${PRECISION} = ${PRECISION} unknown!"
  152. exit 1
  153. fi
  154. # Get the ip address of all nodes.
  155. IP_CMD="hostname -i"
  156. IP_STR=$(srun -pmix --ntasks-per-node=1 bash -c "${IP_CMD}")
  157. IP_STR=$(echo $IP_STR | sed 's/ /,/g')
  158. echo "\${IP_STR} = ${IP_STR}"
  159. # Get the actual pretraining command.
  160. readonly PHASES=( "$PHASE1" "$PHASE2" )
  161. readonly BERT_CMD="\
  162. python -m paddle.distributed.launch \
  163. --gpus=0,1,2,3,4,5,6,7 \
  164. --ips="${IP_STR}" \
  165. /workspace/bert/run_pretraining.py \
  166. ${PHASES[$((PHASE - 1))]} \
  167. --batch-size=${TRAIN_BATCH_SIZE} \
  168. --input-dir=${container_pretrain_parquet} \
  169. --output-dir=${container_output} \
  170. --vocab-file=/workspace/bert/vocab/bert-large-uncased-vocab.txt \
  171. --bert-model=bert-large-uncased \
  172. --config-file=/workspace/bert/bert_configs/bert-large-uncased.json \
  173. --gradient-merge-steps=${GRADIENT_ACCUMULATION_STEPS} \
  174. --log-freq=1 \
  175. --seed=12345 \
  176. --optimizer=Lamb \
  177. ${fp16_flags} "
  178. echo "nodes: ${SLURM_JOB_NUM_NODES}, TRAIN_BATCH_SIZE: ${TRAIN_BATCH_SIZE}, GRADIENT_ACCUMULATION_STEPS: ${GRADIENT_ACCUMULATION_STEPS}"
  179. #
  180. # Running the LDDL preprocessor and load balancer.
  181. #
  182. # Determine the number of parquet shards in total.
  183. if [ "${NUM_SHARDS_PER_WORKER}" == "none" ]; then
  184. readonly num_blocks=4096
  185. else
  186. readonly num_blocks=$((NUM_SHARDS_PER_WORKER * $(( NUM_WORKERS > 0 ? NUM_WORKERS : 1 )) * SLURM_JOB_NUM_NODES * GPUS))
  187. fi
  188. echo "num_blocks: ${num_blocks}"
  189. # Run the LDDL preprocessor and load balancer only when there is no file in
  190. # where the parquets are supposed to be stored.
  191. if [ ! -d "${host_pretrain_parquet}" ] || [ -z "$(ls -A "${host_pretrain_parquet}")" ]; then
  192. # The sequence length is 128 for Phase1, but 512 for Phase2.
  193. if [ "${PHASE}" == "1" ]; then
  194. readonly target_seq_len_flag=""
  195. elif [ "${PHASE}" == "2" ]; then
  196. readonly target_seq_len_flag="--target-seq-length 512"
  197. else
  198. echo "\${PHASE} = ${PHASE} unknown!"
  199. exit 1
  200. fi
  201. # Should we use sequence binning?
  202. if [ "${BIN_SIZE}" == "none" ]; then
  203. readonly bin_size_flag=""
  204. else
  205. readonly bin_size_flag="--bin-size ${BIN_SIZE}"
  206. fi
  207. # Static masking or dynamic masking?
  208. if [ "${MASKING}" == "dynamic" ]; then
  209. readonly masking_flag=""
  210. elif [ "${MASKING}" == "static" ]; then
  211. readonly masking_flag="--masking"
  212. else
  213. echo "\${MASKING} = ${MASKING} unknown!"
  214. exit 1
  215. fi
  216. # Should we use jemalloc for the LDDL preprocessor?
  217. if [ "${USE_JEMALLOC}" == "true" ]; then
  218. readonly use_jemalloc_flag="--export=ALL,LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so"
  219. elif [ "${USE_JEMALLOC}" == "false" ]; then
  220. readonly use_jemalloc_flag=""
  221. else
  222. echo "\${USE_JEMALLOC} = ${USE_JEMALLOC} unknown!"
  223. exit 1
  224. fi
  225. # Run the LDDL preprocessor.
  226. srun -l \
  227. --mpi=pmix \
  228. --container-image="${docker_image}" \
  229. --container-mounts="${mounts}" \
  230. --ntasks-per-node="${DASK_TASKS_PER_NODE}" \
  231. ${use_jemalloc_flag} \
  232. preprocess_bert_pretrain \
  233. --schedule mpi \
  234. ${target_seq_len_flag} \
  235. --wikipedia ${container_wikipedia_source} \
  236. --sink "${container_pretrain_parquet}" \
  237. --vocab-file /workspace/bert/vocab/bert-large-uncased-vocab.txt \
  238. --num-blocks "${num_blocks}" \
  239. --sample-ratio "${SAMPLE_RATIO}" \
  240. ${bin_size_flag} \
  241. ${masking_flag} \
  242. --seed "${SEED}"
  243. # Run the LDDL load balancer.
  244. srun -l \
  245. --mpi=pmix \
  246. --container-image="${docker_image}" \
  247. --container-mounts="${mounts}" \
  248. --ntasks-per-node="${DASK_TASKS_PER_NODE}" \
  249. balance_dask_output \
  250. --indir "${container_pretrain_parquet}" \
  251. --num-shards "${num_blocks}"
  252. fi
  253. #
  254. # Run pretraining.
  255. #
  256. srun -l -pmix --container-image="${docker_image}" --container-mounts="${mounts}" --ntasks-per-node=1 bash -c "${BERT_CMD}"