- # Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- # -----------------------------------------------------------------------
- #
- # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from argparse import ArgumentParser
- import pandas as pd
- from load import implicit_load
- from feature_spec import FeatureSpec
- from neumf_constants import USER_CHANNEL_NAME, ITEM_CHANNEL_NAME, LABEL_CHANNEL_NAME, TEST_SAMPLES_PER_SERIES
- import torch
- import os
- import tqdm
# Output file names for the serialized tensors produced by this script:
# *_0 holds the (user, item) feature tensor, *_1 holds the matching labels
# (see the train/test mappings built in save_feature_spec).
TEST_1 = 'test_data_1.pt'
TEST_0 = 'test_data_0.pt'
TRAIN_1 = 'train_data_1.pt'
TRAIN_0 = 'train_data_0.pt'
# Column names of the intermediate pandas DataFrame loaded by implicit_load.
USER_COLUMN = 'user_id'
ITEM_COLUMN = 'item_id'
def parse_args():
    """Parse command-line options for the preprocessing script.

    Returns:
        argparse.Namespace with `path`, `output`, `valid_negative`, `seed`.
    """
    parser = ArgumentParser()
    # Declarative option table: (flags, add_argument keyword options).
    option_specs = [
        (('--path',),
         dict(type=str, default='/data/ml-20m/ratings.csv',
              help='Path to reviews CSV file from MovieLens')),
        (('--output',),
         dict(type=str, default='/data',
              help='Output directory for train and test files')),
        (('--valid_negative',),
         dict(type=int, default=100,
              help='Number of negative samples for each positive test example')),
        (('--seed', '-s'),
         dict(type=int, default=1,
              help='Manually set random seed for torch')),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()
class _TestNegSampler:
    """Draws random negative items per user for building the validation set.

    A "negative" for user u is an item u never interacted with in the
    training ratings; candidates are drawn uniformly and rejected while
    they collide with a known (user, item) training pair.
    """

    def __init__(self, train_ratings, nb_neg):
        # train_ratings: 2-D array of [user_id, item_id] rows — in this script
        # a numpy ndarray (main() passes `train_data.values`).
        # Ids are assumed zero-based and contiguous, so max()+1 is the cardinality.
        self.nb_neg = nb_neg
        self.nb_users = int(train_ratings[:, 0].max()) + 1
        self.nb_items = int(train_ratings[:, 1].max()) + 1

        # compute unique ids for quickly created hash set and fast lookup
        # (each (user, item) pair is flattened to user * nb_items + item)
        ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
        self.set = set(ids)

    def generate(self, batch_size=128 * 1024):
        # Every user id repeated nb_neg times, grouped per user:
        # [0,0,...,0, 1,1,...,1, ...].
        users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)

        items = [-1] * len(users)

        # Candidate items are pre-drawn in large batches to amortize RNG cost;
        # the pool is refilled whenever it runs dry.
        random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
        print('Generating validation negatives...')
        for idx, u in enumerate(tqdm.tqdm(users.tolist())):
            if not random_items:
                random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
            j = random_items.pop()
            # Rejection sampling: retry until the candidate is a true negative
            # (i.e. the flattened pair id is not in the training set).
            while u * self.nb_items + j in self.set:
                if not random_items:
                    random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
                j = random_items.pop()
            items[idx] = j

        # Returns a 1-D LongTensor of length nb_users * nb_neg, ordered to
        # match `users` above (all negatives for user 0 first, then user 1, ...).
        items = torch.LongTensor(items)
        return items
def save_feature_spec(user_cardinality, item_cardinality, dtypes, test_negative_samples, output_path,
                      user_feature_name='user',
                      item_feature_name='item',
                      label_feature_name='label'):
    """Write a FeatureSpec YAML describing the tensor files saved by main().

    The spec records each feature's dtype (and cardinality for user/item),
    which saved file carries which feature for the train and test splits,
    and how many samples form one test series (1 positive + the negatives).
    """
    feature_spec = {
        user_feature_name: {'dtype': dtypes[user_feature_name], 'cardinality': int(user_cardinality)},
        item_feature_name: {'dtype': dtypes[item_feature_name], 'cardinality': int(item_cardinality)},
        label_feature_name: {'dtype': dtypes[label_feature_name]},
    }
    metadata = {TEST_SAMPLES_PER_SERIES: test_negative_samples + 1}

    def tensor_chunk(features, files):
        # One source-spec entry: a serialized torch tensor file holding `features`.
        return {'type': 'torch_tensor', 'features': features, 'files': files}

    source_spec = {
        'train': [
            tensor_chunk([user_feature_name, item_feature_name], [TRAIN_0]),
            tensor_chunk([label_feature_name], [TRAIN_1]),
        ],
        'test': [
            tensor_chunk([user_feature_name, item_feature_name], [TEST_0]),
            tensor_chunk([label_feature_name], [TEST_1]),
        ],
    }
    channel_spec = {
        USER_CHANNEL_NAME: [user_feature_name],
        ITEM_CHANNEL_NAME: [item_feature_name],
        LABEL_CHANNEL_NAME: [label_feature_name],
    }
    spec = FeatureSpec(feature_spec=feature_spec, metadata=metadata, source_spec=source_spec,
                       channel_spec=channel_spec, base_directory="")
    spec.to_yaml(output_path=output_path)
def main():
    """End-to-end preprocessing for NCF.

    Loads raw MovieLens ratings, remaps ids to contiguous ranges, splits the
    last interaction per user into the test set, samples validation
    negatives, and saves train/test tensors plus a feature spec YAML into
    `args.output`. Requires a CUDA device (tensors are moved with .cuda()).
    """
    args = parse_args()
    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    # factorize() yields 0..n-1, so max()+1 is the cardinality.
    user_cardinality = df[USER_COLUMN].max() + 1
    item_cardinality = df[ITEM_COLUMN].max() + 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order

    # Test set is the last interaction for a given user
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)

    # Train set is all interactions but the last one
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    sampler = _TestNegSampler(train_data.values, args.valid_negative)
    test_negs = sampler.generate().cuda()
    # One row of valid_negative negative items per test user.
    test_negs = test_negs.reshape(-1, args.valid_negative)

    # Reshape train set into user,item,label tabular and save
    train_ratings = torch.from_numpy(train_data.values).cuda()
    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
    torch.save(train_labels, os.path.join(args.output, TRAIN_1))

    # Reshape test set into user,item,label tabular and save
    # All users have the same number of items, items for a given user appear consecutively
    test_ratings = torch.from_numpy(test_data.values).cuda()
    test_users_pos = test_ratings[:, 0:1]  # slicing instead of indexing to keep dimensions
    test_items_pos = test_ratings[:, 1:2]
    # Each user id repeated (1 positive + valid_negative negatives) times.
    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1, dim=0)
    # Positive item first, then its negatives, flattened to one column.
    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs), dim=1).reshape(-1, 1)
    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
    negative_labels = torch.zeros_like(test_users_pos, dtype=torch.float32).repeat(1, args.valid_negative)
    test_labels = torch.cat((positive_labels, negative_labels), dim=1).reshape(-1, 1)
    dtypes = {'user': str(test_users.dtype), 'item': str(test_items.dtype), 'label': str(test_labels.dtype)}
    test_tensor = torch.cat((test_users, test_items), dim=1)
    torch.save(test_tensor, os.path.join(args.output, TEST_0))
    torch.save(test_labels, os.path.join(args.output, TEST_1))

    # Consistency fix: build this path with os.path.join like every other
    # artifact above, instead of string concatenation with a hard-coded '/'.
    save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
                      test_negative_samples=args.valid_negative,
                      output_path=os.path.join(args.output, 'feature_spec.yaml'))


if __name__ == '__main__':
    main()