multiproc.py 3.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. # *****************************************************************************
  2. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  3. #
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are met:
  6. # * Redistributions of source code must retain the above copyright
  7. # notice, this list of conditions and the following disclaimer.
  8. # * Redistributions in binary form must reproduce the above copyright
  9. # notice, this list of conditions and the following disclaimer in the
  10. # documentation and/or other materials provided with the distribution.
  11. # * Neither the name of the NVIDIA CORPORATION nor the
  12. # names of its contributors may be used to endorse or promote products
  13. # derived from this software without specific prior written permission.
  14. #
  15. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  16. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  17. # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  19. # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  20. # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  21. # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  22. # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  24. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. #
  26. # *****************************************************************************
  27. import sys
  28. import subprocess
  29. import torch
  30. def main():
  31. argslist = list(sys.argv)[1:]
  32. world_size = torch.cuda.device_count()
  33. if '--world-size' in argslist:
  34. argslist[argslist.index('--world-size') + 1] = str(world_size)
  35. else:
  36. argslist.append('--world-size')
  37. argslist.append(str(world_size))
  38. workers = []
  39. for i in range(world_size):
  40. if '--rank' in argslist:
  41. argslist[argslist.index('--rank') + 1] = str(i)
  42. else:
  43. argslist.append('--rank')
  44. argslist.append(str(i))
  45. stdout = None if i == 0 else subprocess.DEVNULL
  46. worker = subprocess.Popen(
  47. [str(sys.executable)] + argslist, stdout=stdout)
  48. workers.append(worker)
  49. returncode = 0
  50. try:
  51. pending = len(workers)
  52. while pending > 0:
  53. for worker in workers:
  54. try:
  55. worker_returncode = worker.wait(1)
  56. except subprocess.TimeoutExpired:
  57. continue
  58. pending -= 1
  59. if worker_returncode != 0:
  60. if returncode != 1:
  61. for worker in workers:
  62. worker.terminate()
  63. returncode = 1
  64. except KeyboardInterrupt:
  65. print('Pressed CTRL-C, TERMINATING')
  66. for worker in workers:
  67. worker.terminate()
  68. for worker in workers:
  69. worker.wait()
  70. raise
  71. sys.exit(returncode)
  72. if __name__ == "__main__":
  73. main()