# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import yaml

from .defaults import CATEGORICAL_CHANNEL, NUMERICAL_CHANNEL, LABEL_CHANNEL, \
    TRAIN_MAPPING, TEST_MAPPING, \
    CARDINALITY_SELECTOR, DTYPE_SELECTOR, \
    SPLIT_BINARY

  23. """ For performance reasons, numerical features are required to appear in the same order
  24. in both source_spec and channel_spec.
  25. For more detailed requirements, see the check_feature_spec method"""
  26. TYPE_SELECTOR = "type"
  27. FEATURES_SELECTOR = "features"
  28. FILES_SELECTOR = "files"
  29. class FeatureSpec:
  30. """
  31. This class contains the metadata necessary to find, interpret, load and dataset and supply it to the model.
  32. feature_spec section contains the definitions and per-feature metadata of features used in the model
  33. source_spec contains the specifics of how the feature data is sourced. It is a dict of configurations, each
  34. providing an instance of the dataset, for example a train or test part
  35. channel_spec the configuration of which features are used by which channels of the model
  36. metadata is an optional dictionary of additional, dataset-wide metadata
  37. base_directory is the path relative to which all paths contained in FeatureSpec are interpreted
  38. """
  39. def __init__(self, feature_spec=None, source_spec=None, channel_spec=None, metadata=None, base_directory=None):
  40. self.feature_spec: Dict = feature_spec if feature_spec is not None else {}
  41. self.source_spec: Dict = source_spec if source_spec is not None else {}
  42. self.channel_spec: Dict = channel_spec if channel_spec is not None else {}
  43. self.metadata: Dict = metadata if metadata is not None else {}
  44. self.base_directory: str = base_directory
  45. @classmethod
  46. def from_yaml(cls, path):
  47. with open(path, 'r') as feature_spec_file:
  48. base_directory = os.path.dirname(path)
  49. feature_spec = yaml.safe_load(feature_spec_file)
  50. return cls.from_dict(feature_spec, base_directory=base_directory)
  51. @classmethod
  52. def from_dict(cls, source_dict, base_directory):
  53. return cls(base_directory=base_directory, **source_dict)
  54. def to_dict(self) -> Dict:
  55. attributes_to_dump = ['feature_spec', 'source_spec', 'channel_spec', 'metadata']
  56. return {attr: self.__dict__[attr] for attr in attributes_to_dump}
  57. def to_string(self):
  58. return yaml.dump(self.to_dict())
  59. def to_yaml(self, output_path=None):
  60. if not output_path:
  61. output_path = self.base_directory + '/feature_spec.yaml'
  62. with open(output_path, 'w') as output_file:
  63. print(yaml.dump(self.to_dict()), file=output_file)
  64. def get_number_of_numerical_features(self) -> int:
  65. numerical_features = self.channel_spec[NUMERICAL_CHANNEL]
  66. return len(numerical_features)
  67. def cat_positions_to_names(self, positions: List[int]):
  68. # Ordering needs to correspond to the one in get_categorical_sizes()
  69. feature_names = self.get_categorical_feature_names()
  70. return [feature_names[i] for i in positions]
  71. def get_categorical_feature_names(self):
  72. """ Provides the categorical feature names. The returned order should me maintained."""
  73. return self.channel_spec[CATEGORICAL_CHANNEL]
  74. def get_categorical_sizes(self) -> List[int]:
  75. """For a given feature spec, this function is expected to return the sizes in the order corresponding to the
  76. order in the channel_spec section """
  77. categorical_features = self.get_categorical_feature_names()
  78. cardinalities = [self.feature_spec[feature_name][CARDINALITY_SELECTOR] for feature_name in
  79. categorical_features]
  80. return cardinalities
  81. # *** Feature Spec checking *** #
  82. def _check_feature_spec_general(self):
  83. # check that correct dtypes are provided for all features
  84. for feature_dict in self.feature_spec.values():
  85. assert DTYPE_SELECTOR in feature_dict
  86. try:
  87. np.dtype(feature_dict[DTYPE_SELECTOR])
  88. except TypeError:
  89. assert False, "Type not understood by numpy"
  90. def _check_source_spec_section_model_specific(self):
  91. set_of_categorical_features = set(self.channel_spec[CATEGORICAL_CHANNEL])
  92. set_of_numerical_features = set(self.channel_spec[NUMERICAL_CHANNEL])
  93. set_of_label_features = set(self.channel_spec[LABEL_CHANNEL])
  94. numerical_features_list = self.channel_spec[NUMERICAL_CHANNEL]
  95. # check that mappings are the ones expected
  96. mapping_name_list = list(self.source_spec.keys())
  97. assert sorted(mapping_name_list) == sorted([TEST_MAPPING, TRAIN_MAPPING])
  98. for mapping_name in [TRAIN_MAPPING, TEST_MAPPING]:
  99. mapping = self.source_spec[mapping_name]
  100. mapping_features = set()
  101. for chunk in mapping:
  102. # check that chunk has the correct type
  103. assert chunk[TYPE_SELECTOR] == SPLIT_BINARY
  104. contained_features = chunk[FEATURES_SELECTOR]
  105. containing_files = chunk[FILES_SELECTOR]
  106. # check that features are unique in mapping
  107. for feature in contained_features:
  108. assert feature not in mapping_features
  109. mapping_features.add(feature)
  110. # check that chunk has at least one features
  111. assert len(contained_features) >= 1
  112. # check that chunk has exactly file
  113. assert len(containing_files) == 1
  114. first_feature = contained_features[0]
  115. if first_feature in set_of_categorical_features:
  116. # check that each categorical feature is in a different file
  117. assert len(contained_features) == 1
  118. # check that the type is one of the supported
  119. assert self.feature_spec[first_feature][DTYPE_SELECTOR] in {'int8', 'int16', 'int32'}
  120. elif first_feature in set_of_numerical_features:
  121. # check that numerical features are all in one chunk
  122. assert sorted(contained_features) == sorted(numerical_features_list)
  123. # check that ordering is exactly same as in channel spec - required for performance
  124. assert contained_features == numerical_features_list
  125. # check numerical dtype
  126. for feature in contained_features:
  127. assert np.dtype(self.feature_spec[feature][DTYPE_SELECTOR]) == np.float16
  128. elif first_feature in set_of_label_features:
  129. # check that label feature is in a separate chunk
  130. assert len(contained_features) == 1
  131. # check label dtype
  132. assert np.dtype(self.feature_spec[first_feature][DTYPE_SELECTOR]) == bool
  133. else:
  134. assert False, "Feature of unknown type"
  135. # check that all features appeared in mapping
  136. assert sorted(mapping_features) == sorted(list(self.feature_spec.keys()))
  137. def _check_channel_spec_section_model_specific(self):
  138. categorical_features_list = self.channel_spec[CATEGORICAL_CHANNEL]
  139. numerical_features_list = self.channel_spec[NUMERICAL_CHANNEL]
  140. label_features_list = self.channel_spec[LABEL_CHANNEL]
  141. set_of_categorical_features = set(categorical_features_list)
  142. set_of_numerical_features = set(numerical_features_list)
  143. # check that exactly one label feature is selected
  144. assert len(label_features_list) == 1
  145. label_feature_name = label_features_list[0]
  146. # check that channels are the ones expected
  147. channel_name_list = list(self.channel_spec.keys())
  148. assert sorted(channel_name_list) == sorted([CATEGORICAL_CHANNEL, NUMERICAL_CHANNEL, LABEL_CHANNEL])
  149. # check that all features used in channel spec are exactly ones defined in feature_spec
  150. feature_spec_features = list(self.feature_spec.keys())
  151. channel_spec_features = list(set.union(set_of_categorical_features,
  152. set_of_numerical_features,
  153. {label_feature_name}))
  154. assert sorted(feature_spec_features) == sorted(channel_spec_features)
  155. # check that lists in channel spec contain unique names
  156. assert sorted(list(set_of_categorical_features)) == sorted(categorical_features_list)
  157. assert sorted(list(set_of_numerical_features)) == sorted(numerical_features_list)
  158. def _check_feature_spec_section_model_specific(self):
  159. # check that categorical features have cardinality provided
  160. set_of_categorical_features = set(self.channel_spec[CATEGORICAL_CHANNEL])
  161. for feature_name, feature_dict in self.feature_spec.items():
  162. if feature_name in set_of_categorical_features:
  163. assert CARDINALITY_SELECTOR in feature_dict
  164. assert isinstance(feature_dict[CARDINALITY_SELECTOR], int)
  165. def _check_feature_spec_model_specific(self):
  166. self._check_channel_spec_section_model_specific()
  167. self._check_feature_spec_section_model_specific()
  168. self._check_source_spec_section_model_specific()
  169. def check_feature_spec(self):
  170. self._check_feature_spec_general()
  171. self._check_feature_spec_model_specific()
  172. # TODO check if cardinality fits in dtype, check if base directory is set
  173. @staticmethod
  174. def get_default_feature_spec(number_of_numerical_features, categorical_feature_cardinalities):
  175. numerical_feature_fstring = "num_{}"
  176. categorical_feature_fstring = "cat_{}.bin"
  177. label_feature_name = "label"
  178. numerical_file_name = "numerical.bin"
  179. categorical_file_fstring = "{}" # TODO remove .bin from feature name, add to file name
  180. label_file_name = "label.bin"
  181. number_of_categorical_features = len(categorical_feature_cardinalities)
  182. numerical_feature_names = [numerical_feature_fstring.format(i) for i in range(number_of_numerical_features)]
  183. categorical_feature_names = [categorical_feature_fstring.format(i) for i in
  184. range(number_of_categorical_features)]
  185. cat_feature_types = [get_categorical_feature_type(int(cat_size)) for cat_size in
  186. categorical_feature_cardinalities]
  187. feature_dict = {f_name: {DTYPE_SELECTOR: str(np.dtype(f_type)), CARDINALITY_SELECTOR: f_size}
  188. for f_name, f_type, f_size in
  189. zip(categorical_feature_names, cat_feature_types, categorical_feature_cardinalities)}
  190. for f_name in numerical_feature_names:
  191. feature_dict[f_name] = {DTYPE_SELECTOR: str(np.dtype(np.float16))}
  192. feature_dict[label_feature_name] = {DTYPE_SELECTOR: str(np.dtype(bool))}
  193. channel_spec = {CATEGORICAL_CHANNEL: categorical_feature_names,
  194. NUMERICAL_CHANNEL: numerical_feature_names,
  195. LABEL_CHANNEL: [label_feature_name]}
  196. source_spec = {}
  197. for filename in (TRAIN_MAPPING, TEST_MAPPING):
  198. source_spec[filename] = []
  199. dst_folder = filename
  200. numerical_file_path = os.path.join(dst_folder, numerical_file_name)
  201. source_spec[filename].append({TYPE_SELECTOR: SPLIT_BINARY,
  202. FEATURES_SELECTOR: numerical_feature_names,
  203. FILES_SELECTOR: [numerical_file_path]})
  204. label_file_path = os.path.join(dst_folder, label_file_name)
  205. source_spec[filename].append({TYPE_SELECTOR: SPLIT_BINARY,
  206. FEATURES_SELECTOR: [label_feature_name],
  207. FILES_SELECTOR: [label_file_path]})
  208. for feature_name in categorical_feature_names:
  209. categorical_file_name = categorical_file_fstring.format(feature_name)
  210. categorical_file_path = os.path.join(dst_folder, categorical_file_name)
  211. source_spec[filename].append({TYPE_SELECTOR: SPLIT_BINARY,
  212. FEATURES_SELECTOR: [feature_name],
  213. FILES_SELECTOR: [categorical_file_path]})
  214. return FeatureSpec(feature_spec=feature_dict, source_spec=source_spec, channel_spec=channel_spec, metadata={})
  215. def get_mapping_paths(self, mapping_name: str):
  216. label_feature_name = self.channel_spec[LABEL_CHANNEL][0]
  217. set_of_categorical_features = set(self.channel_spec[CATEGORICAL_CHANNEL])
  218. set_of_numerical_features = set(self.channel_spec[NUMERICAL_CHANNEL])
  219. label_path = None
  220. numerical_path = None
  221. categorical_paths = dict()
  222. for chunk in self.source_spec[mapping_name]:
  223. local_path = os.path.join(self.base_directory, chunk[FILES_SELECTOR][0])
  224. if chunk[FEATURES_SELECTOR][0] in set_of_numerical_features:
  225. numerical_path = local_path
  226. elif chunk[FEATURES_SELECTOR][0] in set_of_categorical_features:
  227. local_feature = chunk[FEATURES_SELECTOR][0]
  228. categorical_paths[local_feature] = local_path
  229. elif chunk[FEATURES_SELECTOR][0] == label_feature_name:
  230. label_path = local_path
  231. return label_path, numerical_path, categorical_paths
  232. def get_categorical_feature_type(size: int):
  233. """This function works both when max value and cardinality is passed.
  234. Consistency by the user is required"""
  235. types = (np.int8, np.int16, np.int32)
  236. for numpy_type in types:
  237. if size < np.iinfo(numpy_type).max:
  238. return numpy_type
  239. raise RuntimeError(f"Categorical feature of size {size} is too big for defined types")