#!/bin/bash
#SBATCH -N 8 # number of nodes
#SBATCH -t 4:00:00 # wall time
#SBATCH -J "transformer-xl_pyt" # job name
#SBATCH --ntasks-per-node=16 # tasks per node
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failure
#SBATCH --overcommit # Needed for pytorch
#
# run.sub -- Slurm batch script that launches the Transformer-XL multinode
# script inside a container via srun.
#
# Usage: sbatch run.sub {train|eval|all}
#
# Optional environment variables; each one, when non-empty, overrides the
# corresponding default of the launched script:
#   training only:   MAX_STEP SEED BATCH_SIZE LOCAL_BATCH_SIZE BATCH_CHUNK
#                    EVAL_INTERVAL LOG_INTERVAL LR
#   train and eval:  EVAL_BATCH_SIZE CONFIG
#                    FP16 (any non-empty value adds --fp16)
#
# Exit status: 0 on success, 2 for an unknown mode, otherwise the status of
# the last srun step that failed.
#
# Note: `set -e` is deliberately not used. In 'all' mode evaluation is still
# attempted after a failed training step; failures are tracked explicitly.

# Configure the multinode script
SCRIPT="run_multinode_wt103_large.sh"

# Configure workspace in the file system
WORK_DIR="/gpfs/fs1/${USER}/transformer-xl_pyt"
CONT_WORK_DIR="/workspace/transformer-xl"

# Configure container and mounting paths
CONT="<YOUR DOCKER REGISTRY>/transformer-xl:latest"
MOUNTS="${WORK_DIR}/data:${CONT_WORK_DIR}/data,${WORK_DIR}/results:${CONT_WORK_DIR}/results"

# Extra command-line options for the launched script. Kept as arrays so that
# every option and every value reaches the script as its own argument.
EXTRA_TRAIN_PARAMS=()
EXTRA_EVAL_PARAMS=()

#######################################
# Rebuild the extra option arrays from the override environment variables.
# Globals:   MAX_STEP SEED BATCH_SIZE LOCAL_BATCH_SIZE BATCH_CHUNK
#            EVAL_INTERVAL LOG_INTERVAL EVAL_BATCH_SIZE LR FP16 CONFIG (read)
#            EXTRA_TRAIN_PARAMS EXTRA_EVAL_PARAMS (written)
# Arguments: none
# Returns:   0
#######################################
build_extra_params() {
  local entry var flag

  EXTRA_TRAIN_PARAMS=()
  EXTRA_EVAL_PARAMS=()

  # Training-only overrides, written as ENV_VAR:--flag.
  for entry in \
    MAX_STEP:--max_step \
    SEED:--seed \
    BATCH_SIZE:--batch_size \
    LOCAL_BATCH_SIZE:--local_batch_size \
    BATCH_CHUNK:--batch_chunk \
    EVAL_INTERVAL:--eval_interval \
    LOG_INTERVAL:--log_interval; do
    var=${entry%%:*}
    flag=${entry#*:}
    # ${!var} reads the variable whose name is stored in $var.
    if [[ -n "${!var}" ]]; then
      EXTRA_TRAIN_PARAMS+=("${flag}" "${!var}")
    fi
  done

  # The evaluation batch size uses a different flag name in each stage.
  if [[ -n "${EVAL_BATCH_SIZE}" ]]; then
    EXTRA_TRAIN_PARAMS+=(--eval_batch_size "${EVAL_BATCH_SIZE}")
    EXTRA_EVAL_PARAMS+=(--batch_size "${EVAL_BATCH_SIZE}")
  fi
  if [[ -n "${LR}" ]]; then
    EXTRA_TRAIN_PARAMS+=(--lr "${LR}")
  fi
  # FP16 is a switch: its value is ignored, only non-emptiness matters.
  if [[ -n "${FP16}" ]]; then
    EXTRA_TRAIN_PARAMS+=(--fp16)
    EXTRA_EVAL_PARAMS+=(--fp16)
  fi
  if [[ -n "${CONFIG}" ]]; then
    EXTRA_TRAIN_PARAMS+=(--config "${CONFIG}")
    EXTRA_EVAL_PARAMS+=(--config "${CONFIG}")
  fi
  return 0
}

#######################################
# Run one stage of the multinode script inside the container.
# Globals:   CONT MOUNTS SCRIPT CONT_WORK_DIR SLURM_JOB_ID (read)
# Arguments: $1  - stage name passed to the script (train or eval)
#            $2+ - extra options forwarded to the script unchanged
# Returns:   exit status of srun
#######################################
run_stage() {
  local stage=$1
  shift
  srun --mpi=none \
    --container-image="${CONT}" \
    --container-mounts="${MOUNTS}" \
    bash "${SCRIPT}" "${stage}" \
    --work_dir "${CONT_WORK_DIR}/results/${SLURM_JOB_ID}" \
    "$@"
}

#######################################
# Entry point: validate the mode, prepare the results directory and run the
# requested stage(s).
# Globals:   SLURM_JOB_NUM_NODES WORK_DIR (read)
# Arguments: $1 - mode: train, eval or all
# Outputs:   error message on stderr for an unknown mode
# Returns:   0 on success, 2 for an unknown mode, otherwise the status of
#            the last failing srun step
#######################################
main() {
  local mode=${1:-}
  local status=0

  # Reject a bad mode before any cluster resources are touched.
  case "${mode}" in
    train | eval | all) ;;
    *)
      printf "unknown argument 1: '%s' (expected train, eval or all)\n" \
        "${mode}" >&2
      return 2
      ;;
  esac

  build_extra_params

  # Create directory for the results (one task per node)
  srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 \
    mkdir -p "${WORK_DIR}/results" || status=$?

  if [[ "${mode}" == 'train' || "${mode}" == 'all' ]]; then
    # Run training
    run_stage train "${EXTRA_TRAIN_PARAMS[@]}" || status=$?
  fi

  if [[ "${mode}" == 'eval' || "${mode}" == 'all' ]]; then
    # Run final evaluation
    run_stage eval "${EXTRA_EVAL_PARAMS[@]}" || status=$?
  fi

  return "${status}"
}

# Must stay the last command: its status becomes the job's exit status.
main "$@"