# distributed_train.sh
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  15. NUM_PROC=$1
  16. shift
  17. mkdir ./EFFICIENTDET_DGX1_perf-train_AMP_NGPU8_BS-30
  18. declare -a CMD
  19. if [ -n "${SLURM_LOCALID-}" ]; then
  20. # Mode 1: Slurm launched a task for each GPU and set some envvars; no need for parallel launch
  21. if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
  22. CMD=( './bind.sh' '--cpu=exclusive' '--' 'python' '-u' )
  23. else
  24. CMD=( 'python' '-u' )
  25. fi
  26. else
  27. # Mode 2: Single-node Docker; need to launch tasks with Pytorch's distributed launch
  28. CMD=( 'python' '-u' '-m' 'bind_launch' "--nproc_per_node=${NUM_PROC}" )
  29. fi
  30. "${CMD[@]}" train.py "$@"