train.sh 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. #!/usr/bin/env bash
  2. export OMP_NUM_THREADS=1
  3. : ${NUM_GPUS:=8}
  4. : ${BATCH_SIZE:=16}
  5. : ${GRAD_ACCUMULATION:=2}
  6. : ${OUTPUT_DIR:="./output"}
  7. : ${LOG_FILE:=$OUTPUT_DIR/nvlog.json}
  8. : ${DATASET_PATH:=LJSpeech-1.1}
  9. : ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_text_train_v3.txt}
  10. : ${VAL_FILELIST:=filelists/ljs_audio_pitch_text_val.txt}
  11. : ${AMP:=false}
  12. : ${SEED:=""}
  13. : ${LEARNING_RATE:=0.1}
  14. # Adjust these when the amount of data changes
  15. : ${EPOCHS:=1000}
  16. : ${EPOCHS_PER_CHECKPOINT:=20}
  17. : ${WARMUP_STEPS:=1000}
  18. : ${KL_LOSS_WARMUP:=100}
  19. # Train a mixed phoneme/grapheme model
  20. : ${PHONE:=true}
  21. # Enable energy conditioning
  22. : ${ENERGY:=true}
  23. : ${TEXT_CLEANERS:=english_cleaners_v2}
  24. # Add dummy space prefix/suffix is audio is not precisely trimmed
  25. : ${APPEND_SPACES:=false}
  26. : ${LOAD_PITCH_FROM_DISK:=true}
  27. : ${LOAD_MEL_FROM_DISK:=false}
  28. # For multispeaker models, add speaker ID = {0, 1, ...} as the last filelist column
  29. : ${NSPEAKERS:=1}
  30. : ${SAMPLING_RATE:=22050}
  31. # Adjust env variables to maintain the global batch size: NUM_GPUS x BATCH_SIZE x GRAD_ACCUMULATION = 256.
  32. GBS=$(($NUM_GPUS * $BATCH_SIZE * $GRAD_ACCUMULATION))
  33. [ $GBS -ne 256 ] && echo -e "\nWARNING: Global batch size changed from 256 to ${GBS}."
  34. echo -e "\nAMP=$AMP, ${NUM_GPUS}x${BATCH_SIZE}x${GRAD_ACCUMULATION}" \
  35. "(global batch size ${GBS})\n"
  36. # ARGS=""
  37. ARGS+=" --cuda"
  38. ARGS+=" -o $OUTPUT_DIR"
  39. ARGS+=" --log-file $LOG_FILE"
  40. ARGS+=" --dataset-path $DATASET_PATH"
  41. ARGS+=" --training-files $TRAIN_FILELIST"
  42. ARGS+=" --validation-files $VAL_FILELIST"
  43. ARGS+=" -bs $BATCH_SIZE"
  44. ARGS+=" --grad-accumulation $GRAD_ACCUMULATION"
  45. ARGS+=" --optimizer lamb"
  46. ARGS+=" --epochs $EPOCHS"
  47. ARGS+=" --epochs-per-checkpoint $EPOCHS_PER_CHECKPOINT"
  48. ARGS+=" --warmup-steps $WARMUP_STEPS"
  49. ARGS+=" -lr $LEARNING_RATE"
  50. ARGS+=" --weight-decay 1e-6"
  51. ARGS+=" --grad-clip-thresh 1000.0"
  52. ARGS+=" --dur-predictor-loss-scale 0.1"
  53. ARGS+=" --pitch-predictor-loss-scale 0.1"
  54. ARGS+=" --trainloader-repeats 100"
  55. ARGS+=" --validation-freq 10"
  56. # Autoalign & new features
  57. ARGS+=" --kl-loss-start-epoch 0"
  58. ARGS+=" --kl-loss-warmup-epochs $KL_LOSS_WARMUP"
  59. ARGS+=" --text-cleaners $TEXT_CLEANERS"
  60. ARGS+=" --n-speakers $NSPEAKERS"
  61. [ "$AMP" = "true" ] && ARGS+=" --amp"
  62. [ "$PHONE" = "true" ] && ARGS+=" --p-arpabet 1.0"
  63. [ "$ENERGY" = "true" ] && ARGS+=" --energy-conditioning"
  64. [ "$SEED" != "" ] && ARGS+=" --seed $SEED"
  65. [ "$LOAD_MEL_FROM_DISK" = true ] && ARGS+=" --load-mel-from-disk"
  66. [ "$LOAD_PITCH_FROM_DISK" = true ] && ARGS+=" --load-pitch-from-disk"
  67. [ "$PITCH_ONLINE_DIR" != "" ] && ARGS+=" --pitch-online-dir $PITCH_ONLINE_DIR" # e.g., /dev/shm/pitch
  68. [ "$PITCH_ONLINE_METHOD" != "" ] && ARGS+=" --pitch-online-method $PITCH_ONLINE_METHOD"
  69. [ "$APPEND_SPACES" = true ] && ARGS+=" --prepend-space-to-text"
  70. [ "$APPEND_SPACES" = true ] && ARGS+=" --append-space-to-text"
  71. [[ "$ARGS" != *"--checkpoint-path"* ]] && ARGS+=" --resume"
  72. if [ "$SAMPLING_RATE" == "44100" ]; then
  73. ARGS+=" --sampling-rate 44100"
  74. ARGS+=" --filter-length 2048"
  75. ARGS+=" --hop-length 512"
  76. ARGS+=" --win-length 2048"
  77. ARGS+=" --mel-fmin 0.0"
  78. ARGS+=" --mel-fmax 22050.0"
  79. elif [ "$SAMPLING_RATE" != "22050" ]; then
  80. echo "Unknown sampling rate $SAMPLING_RATE"
  81. exit 1
  82. fi
  83. mkdir -p "$OUTPUT_DIR"
  84. : ${DISTRIBUTED:="-m torch.distributed.launch --nproc_per_node $NUM_GPUS"}
  85. python $DISTRIBUTED train.py $ARGS "$@"