#!/bin/bash
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Strict mode: abort on errors, unset variables, and pipeline failures.
# (The shebang was previously below the license block, where the kernel
# never sees it; it must be the very first line of the file.)
set -euo pipefail
  16. print_usage() {
  17. cat << EOF
  18. ${0} [options] [--] COMMAND [ARG...]
  19. Control binding policy for each task. Assumes one rank will be launched for each GPU.
  20. Options:
  21. --cpu=MODE
  22. * exclusive -- bind each rank to an exclusive set of cores near its GPU
  23. * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
  24. * node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
  25. * *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
  26. * off -- don't bind
  27. --mem=MODE
  28. * node -- bind each rank to the nearest NUMA node [default]
  29. * *.sh -- bind each rank using the bash associative array bind_mem from a file
  30. * off -- don't bind
  31. --ib=MODE
  32. * single -- bind each rank to a single IB device near its GPU
  33. * off -- don't bind [default]
  34. EOF
  35. }
  36. ################################################################################
  37. # Argument parsing
  38. ################################################################################
  39. cpu_mode='node'
  40. mem_mode='node'
  41. ib_mode='off'
  42. while [ $# -gt 0 ]; do
  43. case "$1" in
  44. -h|--help) print_usage ; exit 0 ;;
  45. --cpu=*) cpu_mode="${1/*=/}"; shift ;;
  46. --cpu) cpu_mode="$2"; shift 2 ;;
  47. --mem=*) mem_mode="${1/*=/}"; shift ;;
  48. --mem) mem_mode="$2"; shift 2 ;;
  49. --ib=*) ib_mode="${1/*=/}"; shift ;;
  50. --ib) ib_mode="$2"; shift 2 ;;
  51. --) shift; break ;;
  52. *) break ;;
  53. esac
  54. done
  55. if [ $# -lt 1 ]; then
  56. echo 'ERROR: no command given' 2>&1
  57. print_usage
  58. exit 1
  59. fi
  60. ################################################################################
  61. # Get system params
  62. ################################################################################
  63. # LOCAL_RANK is set with an enroot hook for Pytorch containers
  64. # SLURM_LOCALID is set by Slurm
  65. # OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
  66. readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
  67. if [ -z "${local_rank}" ]; then
  68. echo 'ERROR: cannot read LOCAL_RANK from env' >&2
  69. exit 1
  70. fi
  71. num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
  72. if [ "${local_rank}" -ge "${num_gpus}" ]; then
  73. echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
  74. exit 1
  75. fi
  76. get_lscpu_value() {
  77. awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
  78. }
  79. lscpu_out=$(lscpu)
  80. num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
  81. num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
  82. cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
  83. echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
  84. readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
  85. if [ ${num_gpus} -gt 1 ]; then
  86. readonly gpus_per_node=$(( num_gpus / num_nodes ))
  87. else
  88. readonly gpus_per_node=1
  89. fi
  90. readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
  91. readonly local_node=$(( local_rank / gpus_per_node ))
  92. declare -a ibdevs=()
  93. readonly num_ibdevs="${#ibdevs[@]}"
  94. ################################################################################
  95. # Setup for exec
  96. ################################################################################
  97. declare -a numactl_args=()
  98. case "${cpu_mode}" in
  99. exclusive)
  100. numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
  101. $(( local_rank * cores_per_gpu )) \
  102. $(( (local_rank + 1) * cores_per_gpu - 1 )) \
  103. $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
  104. $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
  105. )" )
  106. ;;
  107. exclusive,nosmt)
  108. numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
  109. $(( local_rank * cores_per_gpu )) \
  110. $(( (local_rank + 1) * cores_per_gpu - 1 )) \
  111. )" )
  112. ;;
  113. node)
  114. numactl_args+=( "--cpunodebind=${local_node}" )
  115. ;;
  116. *.sh)
  117. source "${cpu_mode}"
  118. if [ -n "${bind_cpu_cores:-}" ]; then
  119. numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
  120. elif [ -n "${bind_cpu_nodes:-}" ]; then
  121. numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
  122. else
  123. echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
  124. exit 1
  125. fi
  126. ;;
  127. off|'')
  128. ;;
  129. *)
  130. echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1
  131. print_usage
  132. exit 1
  133. ;;
  134. esac
  135. case "${mem_mode}" in
  136. node)
  137. numactl_args+=( "--membind=${local_node}" )
  138. ;;
  139. *.sh)
  140. source "${mem_mode}"
  141. if [ -z "${bind_mem:-}" ]; then
  142. echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
  143. exit 1
  144. fi
  145. numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
  146. ;;
  147. off|'')
  148. ;;
  149. *)
  150. echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1
  151. print_usage
  152. exit 1
  153. ;;
  154. esac
  155. case "${ib_mode}" in
  156. single)
  157. if [ "${num_ibdevs}" -eq 0 ]; then
  158. echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1
  159. else
  160. readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
  161. export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
  162. export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}"
  163. fi
  164. ;;
  165. off|'')
  166. ;;
  167. *)
  168. echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1
  169. print_usage
  170. exit 1
  171. ;;
  172. esac
  173. ################################################################################
  174. # Exec
  175. ################################################################################
  176. if [ "${#numactl_args[@]}" -gt 0 ] ; then
  177. exec numactl "${numactl_args[@]}" -- "${@}"
  178. else
  179. exec "${@}"
  180. fi