# utils.py
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
import time
  15. # report latency and throughput during eval
  16. class LogEvalRunHook(tf.estimator.SessionRunHook):
  17. def __init__(self, global_batch_size, hvd_rank=-1):
  18. self.global_batch_size = global_batch_size
  19. self.hvd_rank = hvd_rank
  20. self.count = 0
  21. self.time_list = []
  22. def before_run(self, run_context):
  23. self.t0 = time.time()
  24. def after_run(self, run_context, run_values):
  25. elapsed_secs = time.time() - self.t0
  26. self.count += 1
  27. self.time_list.append(elapsed_secs)
  28. # report throughput during training
  29. class LogTrainRunHook(tf.estimator.SessionRunHook):
  30. def __init__(self, global_batch_size, hvd_rank=-1, save_checkpoints_steps=1000, num_steps_ignore_xla=100):
  31. self.global_batch_size = global_batch_size
  32. self.hvd_rank = hvd_rank
  33. self.save_checkpoints_steps = save_checkpoints_steps
  34. self.total_time = 0.0
  35. self.count = 0 # Holds number of iterations, including skipped iterations for fp16 loss scaling
  36. self.skipped = 0
  37. self.num_steps_ignore_xla = num_steps_ignore_xla
  38. #initial steps while xla is still compilingneed to be ignored from throughput computation
  39. def after_create_session(self, session, coord):
  40. self.init_global_step = session.run(tf.train.get_global_step())
  41. def before_run(self, run_context):
  42. self.t0 = time.time()
  43. return tf.estimator.SessionRunArgs(
  44. fetches=['step_update:0'])
  45. def after_run(self, run_context, run_values):
  46. elapsed_secs = time.time() - self.t0
  47. self.global_step = run_values.results[0]
  48. self.count += 1
  49. # Removing first 100 step + first five steps after every checkpoint save
  50. if (self.global_step - self.init_global_step) <= self.num_steps_ignore_xla or (self.global_step - self.init_global_step) % self.save_checkpoints_steps < 5:
  51. print("Skipping time record for ", self.global_step, " due to checkpoint-saving/warmup overhead")
  52. self.skipped += 1
  53. else:
  54. self.total_time += elapsed_secs