#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# author: Tomasz Grel ([email protected])
# This is a generic SLURM batch script. It runs $cmd
# command in $cont docker image while mounting $mounts directories.
# You can use the $srun_flags variable to pass additional
# arguments to srun.
#
# It is designed to work with enroot/pyxis, but could be modified
# to run on bare-metal machines as well.
#
# Example usage to train a 1.68TB DLRM variant using 32xA100-80GB GPUs on 4 nodes:
#
# cmd='numactl --interleave=all -- python -u main.py --dataset_path /data/dlrm/full_criteo_data --amp \
# --embedding_dim 512 --bottom_mlp_dims 512,256,512' \
# srun_flags='--mpi=pmix' \
# cont=dlrm_tf_adam \
# mounts=/data/dlrm:/data/dlrm \
# sbatch -n 32 -N 4 -t 00:20:00 slurm_multinode.sh
#
set -euo pipefail

# Fail fast with a clear message if a required variable is missing,
# instead of launching a job that runs an empty command or image.
: "${cmd:?Set \$cmd to the command to run inside the container}"
: "${cont:?Set \$cont to the container image to run}"

# $srun_flags is intentionally left unquoted so that it can carry several
# whitespace-separated srun arguments (e.g. '--mpi=pmix', which takes
# precedence over the default --mpi=none because it appears later on the
# command line). $mounts and $srun_flags default to empty when unset so
# they remain optional under `set -u`.
# shellcheck disable=SC2086
srun --mpi=none ${srun_flags:-} --ntasks-per-node=1 \
  --container-image="${cont}" --container-mounts="${mounts:-}" /bin/bash -c "${cmd}"