multiproc.py 3.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. # *****************************************************************************
  2. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  3. #
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are met:
  6. # * Redistributions of source code must retain the above copyright
  7. # notice, this list of conditions and the following disclaimer.
  8. # * Redistributions in binary form must reproduce the above copyright
  9. # notice, this list of conditions and the following disclaimer in the
  10. # documentation and/or other materials provided with the distribution.
  11. # * Neither the name of the NVIDIA CORPORATION nor the
  12. # names of its contributors may be used to endorse or promote products
  13. # derived from this software without specific prior written permission.
  14. #
  15. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  16. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  17. # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  19. # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  20. # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  21. # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  22. # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  24. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. #
  26. # *****************************************************************************
  27. import sys
  28. import subprocess
  29. import torch
  30. def main():
  31. argslist = list(sys.argv)[1:]
  32. world_size = torch.cuda.device_count()
  33. if '--set-world-size' in argslist:
  34. idx = argslist.index('--set-world-size')
  35. world_size = int(argslist[idx + 1])
  36. del argslist[idx + 1]
  37. del argslist[idx]
  38. if '--world-size' in argslist:
  39. argslist[argslist.index('--world-size') + 1] = str(world_size)
  40. else:
  41. argslist.append('--world-size')
  42. argslist.append(str(world_size))
  43. workers = []
  44. for i in range(world_size):
  45. if '--rank' in argslist:
  46. argslist[argslist.index('--rank') + 1] = str(i)
  47. else:
  48. argslist.append('--rank')
  49. argslist.append(str(i))
  50. stdout = None if i == 0 else subprocess.DEVNULL
  51. worker = subprocess.Popen(
  52. [str(sys.executable)] + argslist, stdout=stdout)
  53. workers.append(worker)
  54. returncode = 0
  55. try:
  56. pending = len(workers)
  57. while pending > 0:
  58. for worker in workers:
  59. try:
  60. worker_returncode = worker.wait(1)
  61. except subprocess.TimeoutExpired:
  62. continue
  63. pending -= 1
  64. if worker_returncode != 0:
  65. if returncode != 1:
  66. for worker in workers:
  67. worker.terminate()
  68. returncode = 1
  69. except KeyboardInterrupt:
  70. print('Pressed CTRL-C, TERMINATING')
  71. for worker in workers:
  72. worker.terminate()
  73. for worker in workers:
  74. worker.wait()
  75. raise
  76. sys.exit(returncode)
  77. if __name__ == "__main__":
  78. main()