#!/bin/bash
# bind.sh
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  13. #! /bin/bash
  14. set -euo pipefail
  15. print_usage() {
  16. cat << EOF
  17. ${0} [options] [--] COMMAND [ARG...]
  18. Control binding policy for each task. Assumes one rank will be launched for each GPU.
  19. Options:
  20. --cpu=MODE
  21. * exclusive -- bind each rank to an exclusive set of cores near its GPU
  22. * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
  23. * node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
  24. * *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
  25. * off -- don't bind
  26. --mem=MODE
  27. * node -- bind each rank to the nearest NUMA node [default]
  28. * *.sh -- bind each rank using the bash associative array bind_mem from a file
  29. * off -- don't bind
  30. --ib=MODE
  31. * single -- bind each rank to a single IB device near its GPU
  32. * off -- donot bind [default]
  33. --cluster=CLUSTER
  34. Select which cluster is being used. May be required if system params cannot be detected.
  35. EOF
  36. }
  37. ################################################################################
  38. # Argument parsing
  39. ################################################################################
  40. cpu_mode='node'
  41. mem_mode='node'
  42. ib_mode='off'
  43. cluster=''
  44. while [ $# -gt 0 ]; do
  45. case "$1" in
  46. -h|--help) print_usage ; exit 0 ;;
  47. --cpu=*) cpu_mode="${1/*=/}"; shift ;;
  48. --cpu) cpu_mode="$2"; shift 2 ;;
  49. --mem=*) mem_mode="${1/*=/}"; shift ;;
  50. --mem) mem_mode="$2"; shift 2 ;;
  51. --ib=*) ib_mode="${1/*=/}"; shift ;;
  52. --ib) ib_mode="$2"; shift 2 ;;
  53. --cluster=*) cluster="${1/*=/}"; shift ;;
  54. --cluster) cluster="$2"; shift 2 ;;
  55. --) shift; break ;;
  56. *) break ;;
  57. esac
  58. done
  59. if [ $# -lt 1 ]; then
  60. echo 'ERROR: no command given' 2>&1
  61. print_usage
  62. exit 1
  63. fi
################################################################################
# Get system params
################################################################################
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
# NOTE: the ':=' expansions ASSIGN as a side effect, so LOCAL_RANK (and
# possibly SLURM_LOCALID) are also defined in this shell after this line;
# the innermost ':-' keeps 'set -u' from aborting when none are set.
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ -z "${local_rank}" ]; then
echo 'ERROR: cannot read LOCAL_RANK from env' >&2
exit 1
fi
# Query GPU 0 for the total GPU count (csv,noheader,nounits -> bare integer).
num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
# One rank per GPU is assumed, so the local rank must index a real GPU.
if [ "${local_rank}" -ge "${num_gpus}" ]; then
echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
exit 1
fi
  80. get_lscpu_value() {
  81. awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
  82. }
  83. lscpu_out=$(lscpu)
  84. num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
  85. num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
  86. cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
  87. echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
  88. readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
  89. if [ ${num_gpus} -gt 1 ]; then
  90. readonly gpus_per_node=$(( num_gpus / num_nodes ))
  91. else
  92. readonly gpus_per_node=1
  93. fi
  94. readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
  95. readonly local_node=$(( local_rank / gpus_per_node ))
  96. declare -a ibdevs=()
  97. case "${cluster}" in
  98. circe)
  99. # Need to specialize for circe because IB detection is hard
  100. ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
  101. ;;
  102. selene)
  103. # Need to specialize for selene because IB detection is hard
  104. ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
  105. ;;
  106. '')
  107. if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
  108. mapfile -t ibdevs <<< "${ibstat_out}"
  109. fi
  110. ;;
  111. *)
  112. echo "ERROR: Unknown cluster '${cluster}'" >&2
  113. exit 1
  114. ;;
  115. esac
  116. readonly num_ibdevs="${#ibdevs[@]}"
  117. ################################################################################
  118. # Setup for exec
  119. ################################################################################
  120. declare -a numactl_args=()
  121. case "${cpu_mode}" in
  122. exclusive)
  123. numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
  124. $(( local_rank * cores_per_gpu )) \
  125. $(( (local_rank + 1) * cores_per_gpu - 1 )) \
  126. $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
  127. $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
  128. )" )
  129. ;;
  130. exclusive,nosmt)
  131. numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
  132. $(( local_rank * cores_per_gpu )) \
  133. $(( (local_rank + 1) * cores_per_gpu - 1 )) \
  134. )" )
  135. ;;
  136. node)
  137. numactl_args+=( "--cpunodebind=${local_node}" )
  138. ;;
  139. *.sh)
  140. source "${cpu_mode}"
  141. if [ -n "${bind_cpu_cores:-}" ]; then
  142. numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
  143. elif [ -n "${bind_cpu_nodes:-}" ]; then
  144. numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
  145. else
  146. echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
  147. exit 1
  148. fi
  149. ;;
  150. off|'')
  151. ;;
  152. *)
  153. echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1
  154. print_usage
  155. exit 1
  156. ;;
  157. esac
  158. case "${mem_mode}" in
  159. node)
  160. numactl_args+=( "--membind=${local_node}" )
  161. ;;
  162. *.sh)
  163. source "${mem_mode}"
  164. if [ -z "${bind_mem:-}" ]; then
  165. echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
  166. exit 1
  167. fi
  168. numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
  169. ;;
  170. off|'')
  171. ;;
  172. *)
  173. echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1
  174. print_usage
  175. exit 1
  176. ;;
  177. esac
  178. case "${ib_mode}" in
  179. single)
  180. if [ "${num_ibdevs}" -eq 0 ]; then
  181. echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1
  182. else
  183. readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
  184. export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
  185. fi
  186. ;;
  187. off|'')
  188. ;;
  189. *)
  190. echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1
  191. print_usage
  192. exit 1
  193. ;;
  194. esac
  195. ################################################################################
  196. # Exec
  197. ################################################################################
  198. if [ "${#numactl_args[@]}" -gt 0 ] ; then
  199. set -x
  200. exec numactl "${numactl_args[@]}" -- "${@}"
  201. else
  202. exec "${@}"
  203. fi