| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- # Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- # -----------------------------------------------------------------------
- #
- # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from argparse import ArgumentParser
- import pandas as pd
- import numpy as np
- from load import implicit_load
- from convert import save_feature_spec, _TestNegSampler, TEST_0, TEST_1, TRAIN_0, TRAIN_1
- import torch
- import os
- USER_COLUMN = 'user_id'
- ITEM_COLUMN = 'item_id'
- def parse_args():
- parser = ArgumentParser()
- parser.add_argument('--path', type=str, default='/data/ml-20m/ratings.csv',
- help='Path to reviews CSV file from MovieLens')
- parser.add_argument('--output', type=str, default='/data',
- help='Output directory for train and test files')
- parser.add_argument('--valid_negative', type=int, default=100,
- help='Number of negative samples for each positive test example')
- parser.add_argument('--seed', '-s', type=int, default=1,
- help='Manually set random seed for torch')
- parser.add_argument('--test', type=str, help='select modification to be applied to the set')
- return parser.parse_args()
- def main():
- args = parse_args()
- if args.seed is not None:
- torch.manual_seed(args.seed)
- print("Loading raw data from {}".format(args.path))
- df = implicit_load(args.path, sort=False)
- if args.test == 'less_user':
- to_drop = set(list(df[USER_COLUMN].unique())[-100:])
- df = df[~df[USER_COLUMN].isin(to_drop)]
- if args.test == 'less_item':
- to_drop = set(list(df[ITEM_COLUMN].unique())[-100:])
- df = df[~df[ITEM_COLUMN].isin(to_drop)]
- if args.test == 'more_user':
- sample = df.sample(frac=0.2).copy()
- sample[USER_COLUMN] = sample[USER_COLUMN] + 10000000
- df = df.append(sample)
- users = df[USER_COLUMN]
- df = df[users.isin(users[users.duplicated(keep=False)])] # make sure something remains in the train set
- if args.test == 'more_item':
- sample = df.sample(frac=0.2).copy()
- sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 10000000
- df = df.append(sample)
- print("Mapping original user and item IDs to new sequential IDs")
- df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
- df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]
- user_cardinality = df[USER_COLUMN].max() + 1
- item_cardinality = df[ITEM_COLUMN].max() + 1
- # Need to sort before popping to get last item
- df.sort_values(by='timestamp', inplace=True)
- # clean up data
- del df['rating'], df['timestamp']
- df = df.drop_duplicates() # assuming it keeps order
- # Test set is the last interaction for a given user
- grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
- test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
- # Train set is all interactions but the last one
- train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])
- sampler = _TestNegSampler(train_data.values, args.valid_negative)
- test_negs = sampler.generate().cuda()
- if args.valid_negative > 0:
- test_negs = test_negs.reshape(-1, args.valid_negative)
- else:
- test_negs = test_negs.reshape(test_data.shape[0], 0)
- if args.test == 'more_pos':
- mask = np.random.rand(len(test_data)) < 0.5
- sample = test_data[mask].copy()
- sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 5
- test_data = test_data.append(sample)
- test_negs_copy = test_negs[mask]
- test_negs = torch.cat((test_negs, test_negs_copy), dim=0)
- if args.test == 'less_pos':
- mask = np.random.rand(len(test_data)) < 0.5
- test_data = test_data[mask]
- test_negs = test_negs[mask]
- # Reshape train set into user,item,label tabular and save
- train_ratings = torch.from_numpy(train_data.values).cuda()
- train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
- torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
- torch.save(train_labels, os.path.join(args.output, TRAIN_1))
- # Reshape test set into user,item,label tabular and save
- # All users have the same number of items, items for a given user appear consecutively
- test_ratings = torch.from_numpy(test_data.values).cuda()
- test_users_pos = test_ratings[:, 0:1] # slicing instead of indexing to keep dimensions
- test_items_pos = test_ratings[:, 1:2]
- test_users = test_users_pos.repeat_interleave(args.valid_negative + 1, dim=0)
- test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs), dim=1).reshape(-1, 1)
- positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
- negative_labels = torch.zeros_like(test_users_pos, dtype=torch.float32).repeat(1, args.valid_negative)
- test_labels = torch.cat((positive_labels, negative_labels), dim=1).reshape(-1, 1)
- dtypes = {'user': str(test_users.dtype), 'item': str(test_items.dtype), 'label': str(test_labels.dtype)}
- test_tensor = torch.cat((test_users, test_items), dim=1)
- torch.save(test_tensor, os.path.join(args.output, TEST_0))
- torch.save(test_labels, os.path.join(args.output, TEST_1))
- if args.test == 'other_names':
- dtypes = {'user_2': str(test_users.dtype),
- 'item_2': str(test_items.dtype),
- 'label_2': str(test_labels.dtype)}
- save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
- test_negative_samples=args.valid_negative, output_path=args.output + '/feature_spec.yaml',
- user_feature_name='user_2',
- item_feature_name='item_2',
- label_feature_name='label_2')
- else:
- save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
- test_negative_samples=args.valid_negative, output_path=args.output + '/feature_spec.yaml')
- if __name__ == '__main__':
- main()
|