| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import math
- import os
- import time
- import numpy as np
- import argparse
- import torch
- from torch.utils.data import Dataset
class CriteoBinDataset(Dataset):
    """Batch-level dataset backed by a single flat binary file.

    The file is read as a sequence of fixed-size records. Each record holds
    ``1 + numerical_features + categorical_features`` values of ``data_type``:
    the label first, then the numerical features, then the categorical
    features. One dataset item is a whole batch of ``batch_size`` consecutive
    records, so ``len(dataset)`` is the number of *batches*, not of samples.

    Args:
        data_file: Path of the binary file to read.
        batch_size: Number of records returned by a single ``__getitem__``.
        subset: Optional fraction in (0, 1] of the batches to expose; the
            resulting batch count is rounded up to a whole number.
        numerical_features: Number of numerical columns per record.
        categorical_features: Number of categorical columns per record.
        data_type: Name of the numpy scalar type the file is stored in
            (looked up in the numpy module namespace).
        online_shuffle: When true, every ``__getitem__`` seeks to the
            requested batch. When false, batches are read sequentially from
            the current file position and the index only matters when it is
            0 (which rewinds the file).

    Raises:
        ValueError: If ``subset`` is outside (0, 1].
    """

    def __init__(self, data_file, batch_size=1, subset=None,
                 numerical_features=13, categorical_features=26,
                 data_type='int32', online_shuffle=True):
        self.data_type = np.__dict__[data_type]
        bytes_per_feature = self.data_type().nbytes

        # Column index where the categorical features start (label + numerical).
        self.tad_fea = 1 + numerical_features
        # Total number of columns in one record.
        self.tot_fea = 1 + numerical_features + categorical_features

        self.batch_size = batch_size
        # Despite the name, this is the byte size of one whole batch.
        self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size)
        # Rounded up so that a trailing partial batch is still counted.
        self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)

        if subset is not None:
            if subset <= 0 or subset > 1:
                raise ValueError('Subset parameter must be in (0,1) range')
            # Keep the batch count an int: __len__ must return an integer,
            # otherwise len(dataset) raises TypeError.
            self.num_entries = math.ceil(self.num_entries * subset)

        print('data file:', data_file, 'number of batches:', self.num_entries)
        self.file = open(data_file, 'rb')
        self.online_shuffle = online_shuffle

    def __len__(self):
        """Return the number of batches exposed by the dataset."""
        return self.num_entries

    def __getitem__(self, idx):
        """Read one batch and split it into its three parts.

        Args:
            idx: Zero-based batch index.

        Returns:
            A ``(numerical_features, categorical_features, labels)`` tuple of
            tensors with one row per record read. The numerical columns are
            reinterpreted as float32; the other two keep ``data_type``. The
            last batch of the file may contain fewer than ``batch_size`` rows.
            The tensors are built on top of the raw bytes read from the file
            without copying, so treat them as read-only.

        Raises:
            IndexError: If ``idx`` is past the last batch.
        """
        # Signalling the end with IndexError lets plain iteration over the
        # dataset terminate instead of yielding empty batches forever.
        if idx >= self.num_entries:
            raise IndexError(
                f'batch index {idx} out of range for {self.num_entries} batches')

        if idx == 0:
            self.file.seek(0, 0)

        if self.online_shuffle:
            self.file.seek(idx * self.bytes_per_entry, 0)

        raw_data = self.file.read(self.bytes_per_entry)
        array = np.frombuffer(raw_data, dtype=self.data_type).reshape(-1, self.tot_fea)

        # numerical features are encoded as float32
        numerical_features = array[:, 1:self.tad_fea].view(dtype=np.float32)
        numerical_features = torch.from_numpy(numerical_features)

        categorical_features = torch.from_numpy(array[:, self.tad_fea:])
        labels = torch.from_numpy(array[:, 0])

        return numerical_features, categorical_features, labels

    def __del__(self):
        """Close the underlying file, if it was ever opened."""
        # __init__ can fail before self.file is assigned (missing file, bad
        # subset); in that case there is nothing to close.
        file = getattr(self, 'file', None)
        if file is not None:
            file.close()
if __name__ == '__main__':
    # Stand-alone benchmark: time how long it takes to fetch `--steps`
    # batches of `--batch_size` records from the binary file `--file`.
    print('Dataloader benchmark')

    parser = argparse.ArgumentParser()
    # Both arguments are mandatory: without them the dataset cannot be built.
    parser.add_argument('--file', type=str, required=True)
    parser.add_argument('--batch_size', type=int, required=True)
    parser.add_argument('--steps', type=int, default=1000)
    args = parser.parse_args()

    # Reject values that would otherwise surface as a ZeroDivisionError.
    if args.batch_size <= 0:
        parser.error('--batch_size must be a positive integer')
    if args.steps <= 0:
        parser.error('--steps must be a positive integer')

    dataset = CriteoBinDataset(data_file=args.file, batch_size=args.batch_size)
    num_batches = len(dataset)
    if num_batches == 0:
        parser.error(f'{args.file} contains no data')

    # perf_counter is monotonic, so it is the right clock for intervals.
    begin = time.perf_counter()
    for i in range(args.steps):
        # Wrap around so that files shorter than `--steps` batches are
        # re-read instead of being indexed past their end.
        _ = dataset[i % num_batches]
    end = time.perf_counter()

    step_time = (end - begin) / args.steps
    throughput = args.batch_size / step_time

    print(f'Mean step time: {step_time:.6f} [s]')
    print(f'Mean throughput: {throughput:,.0f} [samples / s]')
|