log_helper.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import subprocess
  16. import sys
  17. import itertools
  18. import atexit
  19. import dllogger
  20. from dllogger import Backend, JSONStreamBackend, StdOutBackend
  21. import torch.distributed as dist
  22. from torch.utils.tensorboard import SummaryWriter
  23. class TensorBoardBackend(Backend):
  24. def __init__(self, verbosity, log_dir):
  25. super().__init__(verbosity=verbosity)
  26. self.summary_writer = SummaryWriter(log_dir=os.path.join(log_dir, 'TB_summary'),
  27. flush_secs=120,
  28. max_queue=200
  29. )
  30. self.hp_cache = None
  31. atexit.register(self.summary_writer.close)
  32. @property
  33. def log_level(self):
  34. return self._log_level
  35. def metadata(self, timestamp, elapsedtime, metric, metadata):
  36. pass
  37. def log(self, timestamp, elapsedtime, step, data):
  38. if step == 'HPARAMS':
  39. parameters = {k: v for k, v in data.items() if not isinstance(v, (list, tuple))}
  40. #Unpack list and tuples
  41. for d in [{k+f'_{i}':v for i,v in enumerate(l)} for k,l in data.items() if isinstance(l, (list, tuple))]:
  42. parameters.update(d)
  43. #Remove custom classes
  44. parameters = {k: v for k, v in data.items() if isinstance(v, (int, float, str, bool))}
  45. parameters.update({k:'None' for k, v in data.items() if v is None})
  46. self.hp_cache = parameters
  47. if step == ():
  48. if self.hp_cache is None:
  49. print('Warning: Cannot save HParameters. Please log HParameters with step=\'HPARAMS\'', file=sys.stderr)
  50. return
  51. self.summary_writer.add_hparams(self.hp_cache, data)
  52. if not isinstance(step, int):
  53. return
  54. for k, v in data.items():
  55. self.summary_writer.add_scalar(k, v, step)
  56. def flush(self):
  57. pass
  58. def setup_logger(args):
  59. os.makedirs(args.results, exist_ok=True)
  60. log_path = os.path.join(args.results, args.log_file)
  61. if os.path.exists(log_path):
  62. for i in itertools.count():
  63. s_fname = args.log_file.split('.')
  64. fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}'
  65. log_path = os.path.join(args.results, fname)
  66. if not os.path.exists(log_path):
  67. break
  68. def metric_format(metric, metadata, value):
  69. return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)
  70. def step_format(step):
  71. if step == ():
  72. return "Finished |"
  73. elif isinstance(step, int):
  74. return "Step {0: <5} |".format(step)
  75. return "Step {} |".format(step)
  76. if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
  77. dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
  78. TensorBoardBackend(verbosity=1, log_dir=args.results),
  79. StdOutBackend(verbosity=2,
  80. step_format=step_format,
  81. prefix_format=lambda x: "")#,
  82. #metric_format=metric_format)
  83. ])
  84. else:
  85. dllogger.init(backends=[])
  86. dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)
  87. container_setup_info = {**get_framework_env_vars(), **get_system_info()}
  88. dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)
  89. dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f', 'unit': None})
  90. dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f', 'unit': None})
  91. dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f', 'unit': None})
  92. dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f', 'unit': None})
  93. dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f', 'unit': 'items/s'})
  94. dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f', 'unit': None})
  95. dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f', 'unit': None})
  96. dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f', 'unit': None})
  97. dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f', 'unit': None})
  98. dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f', 'unit': 'items/s'})
  99. dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': None})
  100. dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': None})
  101. dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': None})
  102. dllogger.metadata('sum', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': None})
  103. dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f', 'unit': 'items/s'})
  104. dllogger.metadata('latency_avg', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': 's'})
  105. dllogger.metadata('latency_p90', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': 's'})
  106. dllogger.metadata('latency_p95', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': 's'})
  107. dllogger.metadata('latency_p99', {'GOAL': 'MIMIMIZE', 'STAGE': 'TEST', 'format': ':5f', 'unit': 's'})
  108. dllogger.metadata('average_ips', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f', 'unit': 'items/s'})
  109. def get_framework_env_vars():
  110. return {
  111. 'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
  112. 'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
  113. 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
  114. 'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
  115. 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
  116. 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
  117. 'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
  118. 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
  119. 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
  120. 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
  121. }
  122. def get_system_info():
  123. system_info = subprocess.run('nvidia-smi --query-gpu=gpu_name,memory.total,enforced.power.limit --format=csv'.split(), capture_output=True).stdout
  124. system_info = [i.decode('utf-8') for i in system_info.split(b'\n')]
  125. system_info = [x for x in system_info if x]
  126. return {'system_info': system_info}