setup.py 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import dllogger as logger
  16. import tensorflow as tf
  17. import horovod.tensorflow as hvd
  18. import numpy as np
  19. from dllogger import StdOutBackend, Verbosity, JSONStreamBackend
  20. from utils.model_fn import unet_fn
  21. def set_flags():
  22. os.environ['CUDA_CACHE_DISABLE'] = '1'
  23. os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
  24. os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
  25. os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
  26. os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '0'
  27. os.environ['TF_ADJUST_HUE_FUSED'] = '1'
  28. os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
  29. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
  30. os.environ['TF_SYNC_ON_FINISH'] = '0'
  31. os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
  32. def prepare_model_dir(params):
  33. model_dir = os.path.join(params.model_dir, "model_checkpoint")
  34. model_dir = model_dir if (hvd.rank() == 0 and not params.benchmark) else None
  35. if model_dir is not None:
  36. os.makedirs(model_dir, exist_ok=True)
  37. if ('train' in params.exec_mode) and (not params.resume_training):
  38. os.system('rm -rf {}/*'.format(model_dir))
  39. return model_dir
  40. def build_estimator(params, model_dir):
  41. if params.use_amp:
  42. os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
  43. else:
  44. os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '0'
  45. np.random.seed(params.seed)
  46. tf.compat.v1.random.set_random_seed(params.seed)
  47. tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
  48. gpu_options = tf.compat.v1.GPUOptions()
  49. config = tf.compat.v1.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
  50. if params.use_xla:
  51. config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
  52. config.gpu_options.allow_growth = True
  53. config.gpu_options.visible_device_list = str(hvd.local_rank())
  54. run_config = tf.estimator.RunConfig(
  55. save_summary_steps=1,
  56. tf_random_seed=params.seed,
  57. session_config=config,
  58. save_checkpoints_steps=(params.max_steps // hvd.size()) if hvd.rank() == 0 else None,
  59. keep_checkpoint_max=1)
  60. estimator = tf.estimator.Estimator(
  61. model_fn=unet_fn,
  62. model_dir=model_dir,
  63. config=run_config,
  64. params=params)
  65. return estimator
  66. def get_logger(params):
  67. backends = []
  68. if hvd.rank() == 0:
  69. backends += [StdOutBackend(Verbosity.VERBOSE)]
  70. if params.log_dir:
  71. backends += [JSONStreamBackend(Verbosity.VERBOSE, params.log_dir)]
  72. logger.init(backends=backends)
  73. logger.metadata("eval_dice_score", {"unit": None})
  74. logger.metadata("throughput_test", {"unit": "images/s"})
  75. logger.metadata("throughput_train", {"unit": "images/s"})
  76. return logger