#!/usr/bin/env bash
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. set -ex
  15. echo "Container nvidia build = " $NVIDIA_BUILD_ID
  16. train_batch_size=${1:-256}
  17. learning_rate=${2:-"6e-3"}
  18. precision=${3:-"amp"}
  19. num_gpus=${4:-8}
  20. warmup_proportion=${5:-"0.2843"}
  21. train_steps=${6:-7038}
  22. save_checkpoint_steps=${7:-200}
  23. create_logfile=${8:-"false"}
  24. gradient_accumulation_steps=${9:-32}
  25. seed=${10:-12439}
  26. job_name=${11:-"bert_lamb_pretraining"}
  27. train_batch_size_phase2=${12:-32}
  28. learning_rate_phase2=${13:-"4e-3"}
  29. warmup_proportion_phase2=${14:-"0.128"}
  30. train_steps_phase2=${15:-1563}
  31. gradient_accumulation_steps_phase2=${16:-128}
  32. #change this for other datasets
  33. DATASET=pretrain/phase1/unbinned/parquet
  34. DATA_DIR_PHASE1=${17:-$BERT_PREP_WORKING_DIR/${DATASET}/}
  35. #change this for other datasets
  36. DATASET2=pretrain/phase2/bin_size_64/parquet
  37. DATA_DIR_PHASE2=${18:-$BERT_PREP_WORKING_DIR/${DATASET2}/}
  38. CODEDIR=${19:-"/workspace/bert"}
  39. init_checkpoint=${20:-"None"}
  40. VOCAB_FILE=vocab/bert-large-uncased-vocab.txt
  41. RESULTS_DIR=$CODEDIR/results
  42. CHECKPOINTS_DIR=$RESULTS_DIR
  43. wikipedia_source=${21:-$BERT_PREP_WORKING_DIR/wikipedia/source/}
  44. num_dask_workers=${22:-$(nproc)}
  45. num_shards_per_worker=${23:-128}
  46. num_workers=${24:-4}
  47. num_nodes=1
  48. sample_ratio=${25:-0.9}
  49. phase2_bin_size=${26:-64}
  50. masking=${27:-static}
  51. BERT_CONFIG=${28:-"None"}
  52. enable_benchmark=${29:-"false"}
  53. benchmark_steps=${30:-"10"}
  54. benchmark_warmup_steps=${31:-"10"}
  55. fuse_mha=${32:-"true"}
  56. # Calculate the total number of shards.
  57. readonly num_blocks=$((num_shards_per_worker * $(( num_workers > 0 ? num_workers : 1 )) * num_nodes * num_gpus))
  58. if [ "${phase2_bin_size}" == "none" ]; then
  59. readonly phase2_bin_size_flag=""
  60. elif [[ "${phase2_bin_size}" =~ ^(32|64|128|256|512)$ ]]; then
  61. readonly phase2_bin_size_flag="--bin-size ${phase2_bin_size}"
  62. else
  63. echo "Error! phase2_bin_size=${phase2_bin_size} not supported!"
  64. return -1
  65. fi
  66. if [ "${masking}" == "static" ]; then
  67. readonly masking_flag="--masking"
  68. elif [ "${masking}" == "dynamic" ]; then
  69. readonly masking_flag=""
  70. else
  71. echo "Error! masking=${masking} not supported!"
  72. return -1
  73. fi
  74. mkdir -p $CHECKPOINTS_DIR
  75. if [ ! -d "${DATA_DIR_PHASE1}" ] || [ -z "$(ls -A ${DATA_DIR_PHASE1})" ]; then
  76. echo "Warning! ${DATA_DIR_PHASE1} directory missing."
  77. if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A ${wikipedia_source})" ]; then
  78. echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
  79. return -1
  80. fi
  81. preprocess_cmd=" \
  82. mpirun \
  83. --oversubscribe \
  84. --allow-run-as-root \
  85. -np ${num_dask_workers} \
  86. -x LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so \
  87. preprocess_bert_pretrain \
  88. --schedule mpi \
  89. --vocab-file ${VOCAB_FILE} \
  90. --wikipedia ${wikipedia_source} \
  91. --sink ${DATA_DIR_PHASE1} \
  92. --num-blocks ${num_blocks} \
  93. --sample-ratio ${sample_ratio} \
  94. ${masking_flag} \
  95. --seed ${seed}"
  96. echo "Running ${preprocess_cmd} ..."
  97. ${preprocess_cmd}
  98. balance_load_cmd=" \
  99. mpirun \
  100. --oversubscribe \
  101. --allow-run-as-root \
  102. -np ${num_dask_workers} \
  103. balance_dask_output \
  104. --indir ${DATA_DIR_PHASE1} \
  105. --num-shards ${num_blocks}"
  106. echo "Running ${balance_load_cmd} ..."
  107. ${balance_load_cmd}
  108. fi
  109. if [ ! -d "$RESULTS_DIR" ] ; then
  110. echo "Error! $RESULTS_DIR directory missing."
  111. exit -1
  112. fi
  113. if [ ! -d "$CHECKPOINTS_DIR" ] ; then
  114. echo "Warning! $CHECKPOINTS_DIR directory missing."
  115. echo "Checkpoints will be written to $RESULTS_DIR instead."
  116. CHECKPOINTS_DIR=$RESULTS_DIR
  117. fi
  118. CONFIG=""
  119. if [ "$BERT_CONFIG" != "None" ] ; then
  120. CONFIG="--config-file=$BERT_CONFIG"
  121. fi
  122. PREC=""
  123. FUSE_MHA=""
  124. if [ "$precision" = "amp" ] ; then
  125. PREC="--amp --use-dynamic-loss-scaling --scale-loss=1048576"
  126. if [ "$fuse_mha" = "true" ] ; then
  127. FUSE_MHA="--fuse-mha"
  128. fi
  129. elif [ "$precision" = "fp32" ] ; then
  130. PREC=""
  131. elif [ "$precision" = "tf32" ] ; then
  132. PREC=""
  133. else
  134. echo "Unknown <precision> argument"
  135. exit -2
  136. fi
  137. ACCUMULATE_GRADIENTS="--gradient-merge-steps=$gradient_accumulation_steps"
  138. INIT_CHECKPOINT=""
  139. if [ "$init_checkpoint" != "None" ] ; then
  140. INIT_CHECKPOINT="--from-checkpoint=$init_checkpoint --last-step-of-checkpoint=auto"
  141. fi
  142. BENCH=""
  143. if [ "$enable_benchmark" = "true" ] ; then
  144. BENCH="--benchmark --benchmark-steps=$benchmark_steps --benchmark-warmup-steps=$benchmark_warmup_steps"
  145. fi
  146. unset CUDA_VISIBLE_DEVICES
  147. if [ "$num_gpus" = "1" ] ; then
  148. DIST_CMD="python -m paddle.distributed.launch --gpus=0"
  149. elif [ "$num_gpus" = "2" ] ; then
  150. DIST_CMD="python -m paddle.distributed.launch --gpus=0,1"
  151. elif [ "$num_gpus" = "3" ] ; then
  152. DIST_CMD="python -m paddle.distributed.launch --gpus=0,1,2"
  153. elif [ "$num_gpus" = "4" ] ; then
  154. DIST_CMD="python -m paddle.distributed.launch --gpus=0,1,2,3"
  155. elif [ "$num_gpus" = "5" ] ; then
  156. DIST_CMD="python -m paddle.distributed.launch --gpus=0,1,2,3,4"
  157. elif [ "$num_gpus" = "6" ] ; then
  158. DIST_CMD="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5"
  159. elif [ "$num_gpus" = "7" ] ; then
  160. DIST_CMD="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6"
  161. elif [ "$num_gpus" = "8" ] ; then
  162. DIST_CMD="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7"
  163. else
  164. echo "Wrong number of gpus"
  165. exit -2
  166. fi
  167. echo $DATA_DIR_PHASE1
  168. INPUT_DIR=$DATA_DIR_PHASE1
  169. CMD=" $CODEDIR/run_pretraining.py"
  170. CMD+=" --input-dir=$DATA_DIR_PHASE1"
  171. CMD+=" --vocab-file=$VOCAB_FILE"
  172. CMD+=" --output-dir=$CHECKPOINTS_DIR"
  173. CMD+=" $CONFIG "
  174. CMD+=" --bert-model=bert-large-uncased"
  175. CMD+=" --batch-size=$train_batch_size"
  176. CMD+=" --max-seq-length=128"
  177. CMD+=" --max-predictions-per-seq=20"
  178. CMD+=" --max-steps=$train_steps"
  179. CMD+=" --warmup-proportion=$warmup_proportion"
  180. CMD+=" --num-steps-per-checkpoint=$save_checkpoint_steps"
  181. CMD+=" --learning-rate=$learning_rate"
  182. CMD+=" --seed=$seed"
  183. CMD+=" --log-freq=1"
  184. CMD+=" --optimizer=Lamb"
  185. CMD+=" --phase1"
  186. CMD+=" $PREC"
  187. CMD+=" $FUSE_MHA"
  188. CMD+=" $ACCUMULATE_GRADIENTS"
  189. CMD+=" $INIT_CHECKPOINT"
  190. CMD+=" $BENCH"
  191. CMD+=" --report-file ${RESULTS_DIR}/dllogger_p1.json "
  192. CMD="$DIST_CMD $CMD"
  193. if [ "$create_logfile" = "true" ] ; then
  194. export GBS=$(expr $train_batch_size \* $num_gpus \* $gradient_accumulation_steps)
  195. printf -v TAG "paddle_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS
  196. DATESTAMP=`date +'%y%m%d%H%M%S'`
  197. LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
  198. printf "Logs written to %s\n" "$LOGFILE"
  199. fi
  200. set -x
  201. if [ -z "$LOGFILE" ] ; then
  202. $CMD
  203. else
  204. (
  205. $CMD
  206. ) |& tee $LOGFILE
  207. fi
  208. set +x
  209. echo "finished pretraining"
  210. #Start Phase2
  211. PREC=""
  212. if [ "$precision" = "amp" ] ; then
  213. PREC="--amp --use-dynamic-loss-scaling --scale-loss=1048576"
  214. elif [ "$precision" = "fp32" ] ; then
  215. PREC=""
  216. elif [ "$precision" = "tf32" ] ; then
  217. PREC=""
  218. else
  219. echo "Unknown <precision> argument"
  220. exit -2
  221. fi
  222. ACCUMULATE_GRADIENTS="--gradient-merge-steps=$gradient_accumulation_steps_phase2"
  223. if [ ! -d "${DATA_DIR_PHASE2}" ] || [ -z "$(ls -A ${DATA_DIR_PHASE2})" ]; then
  224. echo "Warning! ${DATA_DIR_PHASE2} directory missing."
  225. if [ ! -d "${wikipedia_source}" ] || [ -z "$(ls -A ${wikipedia_source})" ]; then
  226. echo "Error! ${wikipedia_source} directory missing. Training cannot start!"
  227. return -1
  228. fi
  229. preprocess_cmd=" \
  230. mpirun \
  231. --oversubscribe \
  232. --allow-run-as-root \
  233. -np ${num_dask_workers} \
  234. -x LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so \
  235. preprocess_bert_pretrain \
  236. --schedule mpi \
  237. --vocab-file ${VOCAB_FILE} \
  238. --wikipedia ${wikipedia_source} \
  239. --sink ${DATA_DIR_PHASE2} \
  240. --target-seq-length 512 \
  241. --num-blocks ${num_blocks} \
  242. --sample-ratio ${sample_ratio} \
  243. ${phase2_bin_size_flag} \
  244. ${masking_flag} \
  245. --seed ${seed}"
  246. echo "Running ${preprocess_cmd} ..."
  247. ${preprocess_cmd}
  248. balance_load_cmd=" \
  249. mpirun \
  250. --oversubscribe \
  251. --allow-run-as-root \
  252. -np ${num_dask_workers} \
  253. balance_dask_output \
  254. --indir ${DATA_DIR_PHASE2} \
  255. --num-shards ${num_blocks}"
  256. echo "Running ${balance_load_cmd} ..."
  257. ${balance_load_cmd}
  258. fi
  259. echo $DATA_DIR_PHASE2
  260. INPUT_DIR=$DATA_DIR_PHASE2
  261. PHASE1_END_CKPT_DIR="${CHECKPOINTS_DIR}/bert-large-uncased/phase1/${train_steps}"
  262. CMD=" $CODEDIR/run_pretraining.py"
  263. CMD+=" --input-dir=$DATA_DIR_PHASE2"
  264. CMD+=" --vocab-file=$VOCAB_FILE"
  265. CMD+=" --output-dir=$CHECKPOINTS_DIR"
  266. CMD+=" $CONFIG "
  267. CMD+=" --bert-model=bert-large-uncased"
  268. CMD+=" --batch-size=$train_batch_size_phase2"
  269. CMD+=" --max-seq-length=512"
  270. CMD+=" --max-predictions-per-seq=80"
  271. CMD+=" --max-steps=$train_steps_phase2"
  272. CMD+=" --warmup-proportion=$warmup_proportion_phase2"
  273. CMD+=" --num-steps-per-checkpoint=$save_checkpoint_steps"
  274. CMD+=" --learning-rate=$learning_rate_phase2"
  275. CMD+=" --seed=$seed"
  276. CMD+=" --log-freq=1"
  277. CMD+=" --optimizer=Lamb"
  278. CMD+=" $PREC"
  279. CMD+=" $ACCUMULATE_GRADIENTS"
  280. CMD+=" $BENCH"
  281. CMD+=" --from-pretrained-params=${PHASE1_END_CKPT_DIR} "
  282. CMD+=" --phase2 "
  283. CMD+=" --report-file ${RESULTS_DIR}/dllogger_p2.json "
  284. CMD="$DIST_CMD $CMD"
  285. if [ "$create_logfile" = "true" ] ; then
  286. export GBS=$(expr $train_batch_size_phase2 \* $num_gpus \* $gradient_accumulation_steps_phase2)
  287. printf -v TAG "paddle_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
  288. DATESTAMP=`date +'%y%m%d%H%M%S'`
  289. LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
  290. printf "Logs written to %s\n" "$LOGFILE"
  291. fi
  292. set -x
  293. if [ -z "$LOGFILE" ] ; then
  294. $CMD
  295. else
  296. (
  297. $CMD
  298. ) |& tee $LOGFILE
  299. fi
  300. set +x
  301. echo "finished phase2"