# multiproc.py
  1. # From PyTorch:
  2. #
  3. # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
  4. # Copyright (c) 2016- Facebook, Inc (Adam Paszke)
  5. # Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
  6. # Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
  7. # Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
  8. # Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
  9. # Copyright (c) 2011-2013 NYU (Clement Farabet)
  10. # Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
  11. # Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
  12. # Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
  13. #
  14. # From Caffe2:
  15. #
  16. # Copyright (c) 2016-present, Facebook Inc. All rights reserved.
  17. #
  18. # All contributions by Facebook:
  19. # Copyright (c) 2016 Facebook Inc.
  20. #
  21. # All contributions by Google:
  22. # Copyright (c) 2015 Google Inc.
  23. # All rights reserved.
  24. #
  25. # All contributions by Yangqing Jia:
  26. # Copyright (c) 2015 Yangqing Jia
  27. # All rights reserved.
  28. #
  29. # All contributions from Caffe:
  30. # Copyright(c) 2013, 2014, 2015, the respective contributors
  31. # All rights reserved.
  32. #
  33. # All other contributions:
  34. # Copyright(c) 2015, 2016 the respective contributors
  35. # All rights reserved.
  36. #
  37. # Caffe2 uses a copyright model similar to Caffe: each contributor holds
  38. # copyright over their contributions to Caffe2. The project versioning records
  39. # all such contribution and copyright details. If a contributor wants to further
  40. # mark their specific copyright on a particular contribution, they should
  41. # indicate their copyright solely in the commit message of the change when it is
  42. # committed.
  43. #
  44. # All rights reserved.
  45. #
  46. # Redistribution and use in source and binary forms, with or without
  47. # modification, are permitted provided that the following conditions are met:
  48. #
  49. # 1. Redistributions of source code must retain the above copyright
  50. # notice, this list of conditions and the following disclaimer.
  51. #
  52. # 2. Redistributions in binary form must reproduce the above copyright
  53. # notice, this list of conditions and the following disclaimer in the
  54. # documentation and/or other materials provided with the distribution.
  55. #
  56. # 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
  57. # and IDIAP Research Institute nor the names of its contributors may be
  58. # used to endorse or promote products derived from this software without
  59. # specific prior written permission.
  60. #
  61. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  62. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  63. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  64. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  65. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  66. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  67. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  68. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  69. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  70. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  71. # POSSIBILITY OF SUCH DAMAGE.
  72. import sys
  73. import subprocess
  74. import os
  75. import socket
  76. import time
  77. from argparse import ArgumentParser, REMAINDER
  78. import torch
  79. def parse_args():
  80. """
  81. Helper function parsing the command line options
  82. @retval ArgumentParser
  83. """
  84. parser = ArgumentParser(description="PyTorch distributed training launch "
  85. "helper utilty that will spawn up "
  86. "multiple distributed processes")
  87. # Optional arguments for the launch helper
  88. parser.add_argument("--nnodes", type=int, default=1,
  89. help="The number of nodes to use for distributed "
  90. "training")
  91. parser.add_argument("--node_rank", type=int, default=0,
  92. help="The rank of the node for multi-node distributed "
  93. "training")
  94. parser.add_argument("--nproc_per_node", type=int, default=1,
  95. help="The number of processes to launch on each node, "
  96. "for GPU training, this is recommended to be set "
  97. "to the number of GPUs in your system so that "
  98. "each process can be bound to a single GPU.")
  99. parser.add_argument("--master_addr", default="127.0.0.1", type=str,
  100. help="Master node (rank 0)'s address, should be either "
  101. "the IP address or the hostname of node 0, for "
  102. "single node multi-proc training, the "
  103. "--master_addr can simply be 127.0.0.1")
  104. parser.add_argument("--master_port", default=29500, type=int,
  105. help="Master node (rank 0)'s free port that needs to "
  106. "be used for communciation during distributed "
  107. "training")
  108. # positional
  109. parser.add_argument("training_script", type=str,
  110. help="The full path to the single GPU training "
  111. "program/script to be launched in parallel, "
  112. "followed by all the arguments for the "
  113. "training script")
  114. # rest from the training program
  115. parser.add_argument('training_script_args', nargs=REMAINDER)
  116. return parser.parse_args()
  117. def main():
  118. args = parse_args()
  119. # world size in terms of number of processes
  120. dist_world_size = args.nproc_per_node * args.nnodes
  121. # set PyTorch distributed related environmental variables
  122. current_env = os.environ.copy()
  123. current_env["MASTER_ADDR"] = args.master_addr
  124. current_env["MASTER_PORT"] = str(args.master_port)
  125. current_env["WORLD_SIZE"] = str(dist_world_size)
  126. processes = []
  127. for local_rank in range(0, args.nproc_per_node):
  128. # each process's rank
  129. dist_rank = args.nproc_per_node * args.node_rank + local_rank
  130. current_env["RANK"] = str(dist_rank)
  131. # spawn the processes
  132. cmd = [sys.executable,
  133. "-u",
  134. args.training_script,
  135. "--local_rank={}".format(local_rank)] + args.training_script_args
  136. print(cmd)
  137. stdout = None if local_rank == 0 else open("GPU_"+str(local_rank)+".log", "w")
  138. process = subprocess.Popen(cmd, env=current_env, stdout=stdout)
  139. processes.append(process)
  140. try:
  141. up = True
  142. error = False
  143. while up and not error:
  144. up = False
  145. for p in processes:
  146. ret = p.poll()
  147. if ret is None:
  148. up = True
  149. elif ret != 0:
  150. error = True
  151. time.sleep(1)
  152. if error:
  153. for p in processes:
  154. if p.poll() is None:
  155. p.terminate()
  156. exit(1)
  157. except KeyboardInterrupt:
  158. for p in processes:
  159. p.terminate()
  160. raise
  161. except SystemExit:
  162. for p in processes:
  163. p.terminate()
  164. raise
  165. except:
  166. for p in processes:
  167. p.terminate()
  168. raise
  169. if __name__ == "__main__":
  170. main()