# train_benchmark.sh
  1. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #!/bin/bash
  15. echo "Container nvidia build = " $NVIDIA_BUILD_ID
  16. SCRIPT_DIR=$(cd $(dirname $0); pwd)
  17. PROJECT_DIR=${SCRIPT_DIR}/..
  18. DATA_DIR=${1:-"/datasets/LibriSpeech"}
  19. MODEL_CONFIG=${2:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
  20. RESULT_DIR=${3:-"/results"}
  21. CREATE_LOGFILE=${4:-"true"}
  22. CUDNN_BENCHMARK=${5:-"true"}
  23. NUM_GPUS=${6:-8}
  24. PRECISION=${7:-"fp16"}
  25. NUM_STEPS=${8:-"-1"}
  26. MAX_DURATION=${9:-16.7}
  27. SEED=${10:-0}
  28. BATCH_SIZE=${11:-64}
  29. LEARNING_RATE=${12:-"0.015"}
  30. GRADIENT_ACCUMULATION_STEPS=${13:-1}
  31. PRINT_FREQUENCY=${14:-1}
  32. USE_PROFILER=${USE_PROFILER:-"false"}
  33. PREC=""
  34. if [ "$PRECISION" = "fp16" ] ; then
  35. PREC=" --fp16"
  36. elif [ "$PRECISION" = "fp32" ] ; then
  37. PREC=""
  38. else
  39. echo "Unknown <precision> argument"
  40. exit -2
  41. fi
  42. STEPS=""
  43. if [ "$NUM_STEPS" -ne "-1" ] ; then
  44. STEPS=" --num_steps=$NUM_STEPS"
  45. elif [ "$NUM_STEPS" = "-1" ] ; then
  46. STEPS=""
  47. else
  48. echo "Unknown <precision> argument"
  49. exit -2
  50. fi
  51. CUDNN=""
  52. if [ "$CUDNN_BENCHMARK" = "true" ] ; then
  53. CUDNN=" --cudnn"
  54. else
  55. CUDNN=""
  56. fi
  57. if [ "${USE_PROFILER}" = "true" ] ; then
  58. PYTHON_ARGS+="-m cProfile -s cumtime"
  59. fi
  60. CMD="${PYTHON_ARGS} ${PROJECT_DIR}/train.py"
  61. CMD+=" --batch_size=$BATCH_SIZE"
  62. CMD+=" --num_epochs=400"
  63. CMD+=" --output_dir=$RESULT_DIR"
  64. CMD+=" --model_toml=$MODEL_CONFIG"
  65. CMD+=" --lr=$LEARNING_RATE"
  66. CMD+=" --seed=$SEED"
  67. CMD+=" --optimizer=novograd"
  68. CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS"
  69. CMD+=" --dataset_dir=$DATA_DIR"
  70. CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json"
  71. CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json"
  72. CMD+=" --weight_decay=1e-3"
  73. CMD+=" --save_freq=100000"
  74. CMD+=" --eval_freq=100000"
  75. CMD+=" --max_duration=$MAX_DURATION"
  76. CMD+=" --pad_to_max"
  77. CMD+=" --train_freq=$PRINT_FREQUENCY"
  78. CMD+=" --lr_decay "
  79. CMD+=" $CUDNN"
  80. CMD+=" $PREC"
  81. CMD+=" $STEPS"
  82. if [ "$NUM_GPUS" -gt 1 ] ; then
  83. CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD"
  84. else
  85. CMD="python3 $CMD"
  86. fi
  87. if [ "$CREATE_LOGFILE" = "true" ] ; then
  88. export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS)
  89. printf -v TAG "jasper_train_benchmark_%s_gbs%d" "$PRECISION" $GBS
  90. DATESTAMP=`date +'%y%m%d%H%M%S'`
  91. LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
  92. printf "Logs written to %s\n" "$LOGFILE"
  93. fi
  94. if [ -z "$LOGFILE" ] ; then
  95. set -x
  96. $CMD
  97. set +x
  98. else
  99. set -x
  100. (
  101. $CMD
  102. ) |& tee "$LOGFILE"
  103. set +x
  104. mean_latency=`cat "$LOGFILE" | grep 'Step time' | awk '{print $3}' | tail -n +2 | egrep -o '[0-9.]+'| awk 'BEGIN {total=0} {total+=$1} END {printf("%.2f\n",total/NR)}'`
  105. mean_throughput=`python -c "print($BATCH_SIZE*$NUM_GPUS/${mean_latency})"`
  106. training_wer_per_pgu=`cat "$LOGFILE" | grep 'training_batch_WER'| awk '{print $2}' | tail -n 1 | egrep -o '[0-9.]+'`
  107. training_loss_per_pgu=`cat "$LOGFILE" | grep 'Loss@Step'| awk '{print $4}' | tail -n 1 | egrep -o '[0-9.]+'`
  108. final_eval_wer=`cat "$LOGFILE" | grep 'Evaluation WER'| tail -n 1 | egrep -o '[0-9.]+'`
  109. final_eval_loss=`cat "$LOGFILE" | grep 'Evaluation Loss'| tail -n 1 | egrep -o '[0-9.]+'`
  110. echo "max duration: $MAX_DURATION s" | tee -a "$LOGFILE"
  111. echo "mean_latency: $mean_latency s" | tee -a "$LOGFILE"
  112. echo "mean_throughput: $mean_throughput sequences/s" | tee -a "$LOGFILE"
  113. echo "training_wer_per_pgu: $training_wer_per_pgu" | tee -a "$LOGFILE"
  114. echo "training_loss_per_pgu: $training_loss_per_pgu" | tee -a "$LOGFILE"
  115. echo "final_eval_loss: $final_eval_loss" | tee -a "$LOGFILE"
  116. echo "final_eval_wer: $final_eval_wer" | tee -a "$LOGFILE"
  117. fi