# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import errno
import logging
import os
import re
import shutil
import tempfile

import paddle
  21. _PDOPT_SUFFIX = '.pdopt'
  22. _PDPARAMS_SUFFIX = '.pdparams'
  23. def _mkdir_if_not_exist(path):
  24. """
  25. Mkdir if not exists, ignore the exception when multiprocess mkdir together.
  26. """
  27. if not os.path.exists(path):
  28. try:
  29. os.makedirs(path)
  30. except OSError as e:
  31. if e.errno == errno.EEXIST and os.path.isdir(path):
  32. logging.warning(
  33. 'be happy if some process has already created %s', path)
  34. else:
  35. raise OSError(f'Failed to mkdir {path}')
  36. def _load_state(path):
  37. """
  38. Load model parameters from .pdparams file.
  39. Args:
  40. path(str): Path to .pdparams file.
  41. Returns:
  42. state(dict): Dict of parameters loaded from file.
  43. """
  44. if os.path.exists(path + _PDOPT_SUFFIX):
  45. tmp = tempfile.mkdtemp()
  46. dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
  47. shutil.copy(path + _PDPARAMS_SUFFIX, dst + _PDPARAMS_SUFFIX)
  48. state = paddle.static.load_program_state(dst)
  49. shutil.rmtree(tmp)
  50. else:
  51. state = paddle.static.load_program_state(path)
  52. return state
  53. def load_params(prog, path, ignore_params=None):
  54. """
  55. Load model from the given path.
  56. Args:
  57. prog (paddle.static.Program): Load weight to which Program object.
  58. path (string): Model path.
  59. ignore_params (list): Ignore variable to load when finetuning.
  60. """
  61. if not (os.path.isdir(path) or os.path.exists(path + _PDPARAMS_SUFFIX)):
  62. raise ValueError(f"Model pretrain path {path} does not exists.")
  63. logging.info("Loading parameters from %s...", path)
  64. ignore_set = set()
  65. state = _load_state(path)
  66. # ignore the parameter which mismatch the shape
  67. # between the model and pretrain weight.
  68. all_var_shape = {}
  69. for block in prog.blocks:
  70. for param in block.all_parameters():
  71. all_var_shape[param.name] = param.shape
  72. ignore_set.update([
  73. name for name, shape in all_var_shape.items()
  74. if name in state and shape != state[name].shape
  75. ])
  76. if ignore_params:
  77. all_var_names = [var.name for var in prog.list_vars()]
  78. ignore_list = filter(
  79. lambda var: any([re.match(name, var) for name in ignore_params]),
  80. all_var_names)
  81. ignore_set.update(list(ignore_list))
  82. if len(ignore_set) > 0:
  83. for k in ignore_set:
  84. if k in state:
  85. logging.warning(
  86. 'variable %s is already excluded automatically', k)
  87. del state[k]
  88. paddle.static.set_program_state(prog, state)
  89. def init_ckpt(path_to_ckpt, program, exe):
  90. """
  91. Init from checkpoints or pretrained model in given path.
  92. Args:
  93. path_to_ckpt(str): The path to files of checkpoints,
  94. including '.pdparams' and '.pdopt'.
  95. program(paddle.static.Program): The program to init model.
  96. exe(paddle.static.Executor): The executor to run program.
  97. """
  98. paddle.static.load(program, path_to_ckpt, exe)
  99. logging.info("Finish initalizing the checkpoint from %s", path_to_ckpt)
  100. def init_pretrained(path_to_pretrained, program):
  101. """
  102. Init from checkpoints or pretrained model in given path.
  103. Args:
  104. path_to_pretrained(str): The path to file of pretrained model.
  105. program(paddle.static.Program): The program to init model.
  106. """
  107. if not isinstance(path_to_pretrained, list):
  108. pretrained_model = [path_to_pretrained]
  109. for pretrain in pretrained_model:
  110. load_params(program, pretrain)
  111. logging.info("Finish initalizing pretrained parameters from %s",
  112. pretrained_model)
  113. def init_program(args, program, exe):
  114. """
  115. Init from given checkpoint or pretrained parameters .
  116. Args:
  117. args(Namespace): Arguments obtained from ArgumentParser.
  118. program(paddle.static.Program): The program to init model.
  119. exe(paddle.static.Executor): The executor to run program.
  120. """
  121. if args.from_checkpoint is not None:
  122. init_ckpt(args.from_checkpoint, program, exe)
  123. logging.info("Training will start at the %d-th epoch",
  124. args.start_epoch)
  125. elif args.from_pretrained_params is not None:
  126. init_pretrained(args.from_pretrained_params, program)
  127. def save_model(program, model_path, epoch_id, prefix):
  128. """
  129. Save a model to given path.
  130. Args:
  131. program(paddle.static.Program): The program to be saved.
  132. model_path(str): The path to save model.
  133. epoch_id(int): The current epoch id.
  134. """
  135. if paddle.distributed.get_rank() != 0:
  136. return
  137. model_path = os.path.join(model_path, str(epoch_id))
  138. _mkdir_if_not_exist(model_path)
  139. model_prefix = os.path.join(model_path, prefix)
  140. paddle.static.save(program, model_prefix)
  141. logging.info("Already save model in %s", model_path)