# data_loader.py
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. import math
  15. import os
  16. import time
  17. import numpy as np
  18. import argparse
  19. import torch
  20. from torch.utils.data import Dataset
  21. class CriteoBinDataset(Dataset):
  22. """Simple dataloader for a recommender system. Designed to work with a single binary file."""
  23. def __init__(self, data_file, batch_size=1, subset=None,
  24. numerical_features=13, categorical_features=26,
  25. data_type='int32', online_shuffle=True):
  26. self.data_type = np.__dict__[data_type]
  27. bytes_per_feature = self.data_type().nbytes
  28. self.tad_fea = 1 + numerical_features
  29. self.tot_fea = 1 + numerical_features + categorical_features
  30. self.batch_size = batch_size
  31. self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size)
  32. self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)
  33. if subset is not None:
  34. if subset <= 0 or subset > 1:
  35. raise ValueError('Subset parameter must be in (0,1) range')
  36. self.num_entries = self.num_entries * subset
  37. print('data file:', data_file, 'number of batches:', self.num_entries)
  38. self.file = open(data_file, 'rb')
  39. self.online_shuffle=online_shuffle
  40. def __len__(self):
  41. return self.num_entries
  42. def __getitem__(self, idx):
  43. if idx == 0:
  44. self.file.seek(0, 0)
  45. if self.online_shuffle:
  46. self.file.seek(idx * self.bytes_per_entry, 0)
  47. raw_data = self.file.read(self.bytes_per_entry)
  48. array = np.frombuffer(raw_data, dtype=self.data_type).reshape(-1, self.tot_fea)
  49. # numerical features are encoded as float32
  50. numerical_features = array[:, 1:self.tad_fea].view(dtype=np.float32)
  51. numerical_features = torch.from_numpy(numerical_features)
  52. categorical_features = torch.from_numpy(array[:, self.tad_fea:])
  53. labels = torch.from_numpy(array[:, 0])
  54. return numerical_features, categorical_features, labels
  55. def __del__(self):
  56. self.file.close()
  57. if __name__ == '__main__':
  58. print('Dataloader benchmark')
  59. parser = argparse.ArgumentParser()
  60. parser.add_argument('--file', type=str)
  61. parser.add_argument('--batch_size', type=int)
  62. parser.add_argument('--steps', type=int, default=1000)
  63. args = parser.parse_args()
  64. dataset = CriteoBinDataset(data_file=args.file, batch_size=args.batch_size)
  65. begin = time.time()
  66. for i in range(args.steps):
  67. _ = dataset[i]
  68. end = time.time()
  69. step_time = (end - begin) / args.steps
  70. throughput = args.batch_size / step_time
  71. print(f'Mean step time: {step_time:.6f} [s]')
  72. print(f'Mean throughput: {throughput:,.0f} [samples / s]')