# logger.py
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time

import numpy as np
import torch

import dllogger as logger
from dllogger import JSONStreamBackend, StdOutBackend, Verbosity
from pytorch_lightning import Callback
from pytorch_lightning.utilities import rank_zero_only
  22. class DLLogger:
  23. def __init__(self, log_dir, filename, append=True):
  24. super().__init__()
  25. self._initialize_dllogger(log_dir, filename, append)
  26. @rank_zero_only
  27. def _initialize_dllogger(self, log_dir, filename, append):
  28. backends = [
  29. JSONStreamBackend(Verbosity.VERBOSE, os.path.join(log_dir, filename), append=append),
  30. StdOutBackend(Verbosity.VERBOSE),
  31. ]
  32. logger.init(backends=backends)
  33. @rank_zero_only
  34. def log_metrics(self, metrics, step=None):
  35. if step is None:
  36. step = ()
  37. logger.log(step=step, data=metrics)
  38. @rank_zero_only
  39. def log_metadata(self, metric, metadata):
  40. logger.metadata(metric, metadata)
  41. @rank_zero_only
  42. def flush(self):
  43. logger.flush()
  44. class LoggingCallback(Callback):
  45. def __init__(self, log_dir, filnename, global_batch_size, mode, warmup, dim):
  46. self.dllogger = DLLogger(log_dir, filnename)
  47. self.warmup_steps = warmup
  48. self.global_batch_size = global_batch_size
  49. self.step = 0
  50. self.dim = dim
  51. self.mode = mode
  52. self.timestamps = []
  53. self.dllogger.log_metadata("dice_score", {"unit": None})
  54. self.dllogger.log_metadata(f"throughput_{self.mode}", {"unit": "images/s"})
  55. self.dllogger.log_metadata(f"latency_{self.mode}_mean", {"unit": "ms"})
  56. for level in [90, 95, 99]:
  57. self.dllogger.log_metadata(f"latency_{self.mode}_{level}", {"unit": "ms"})
  58. def do_step(self):
  59. if self.step > self.warmup_steps:
  60. self.step += 1
  61. return
  62. torch.cuda.synchronize()
  63. self.timestamps.append(time.perf_counter())
  64. def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
  65. if trainer.current_epoch == 1:
  66. self.do_step()
  67. def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
  68. if pl_module.start_benchmark == 1:
  69. self.do_step()
  70. def process_performance_stats(self):
  71. def _round3(val):
  72. return round(val, 3)
  73. elapsed_times = np.diff(self.timestamps)
  74. throughput_imgps = _round3(self.global_batch_size / np.mean(elapsed_times))
  75. timestamps_ms = 1000 * elapsed_times
  76. stats = {
  77. f"throughput_{self.mode}": throughput_imgps,
  78. f"latency_{self.mode}_mean": _round3(np.mean(timestamps_ms)),
  79. }
  80. for level in [90, 95, 99]:
  81. stats.update({f"latency_{self.mode}_{level}": _round3(np.percentile(timestamps_ms, level))})
  82. return stats
  83. @rank_zero_only
  84. def _log(self):
  85. stats = self.process_performance_stats()
  86. self.dllogger.log_metrics(metrics=stats)
  87. self.dllogger.flush()
  88. def on_train_end(self, trainer, pl_module):
  89. self._log()
  90. def on_test_end(self, trainer, pl_module):
  91. if pl_module.start_benchmark == 1:
  92. self._log()