# SunnyMirror / DeepLearningExamples
# Mirror of https://github.com/NVIDIA/DeepLearningExamples.git
#
# Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----------------------------------------------------------------------
#
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from argparse import ArgumentParser
import pandas as pd
import numpy as np
from load import implicit_load
from convert import save_feature_spec, _TestNegSampler, TEST_0, TEST_1, TRAIN_0, TRAIN_1
import torch
import os

# Names of the DataFrame columns holding the user and item identifiers.
USER_COLUMN = 'user_id'
ITEM_COLUMN = 'item_id'


def parse_args():
    """Parse the command-line options of the conversion script.

    Options are read from ``sys.argv``.

    Returns:
        argparse.Namespace: namespace with the attributes ``path``,
        ``output``, ``valid_negative``, ``seed`` and ``test``.
    """
    # Each entry is (option strings, keyword arguments for add_argument);
    # the options are registered in the order they are listed here.
    option_table = (
        (('--path',),
         dict(type=str, default='/data/ml-20m/ratings.csv',
              help='Path to reviews CSV file from MovieLens')),
        (('--output',),
         dict(type=str, default='/data',
              help='Output directory for train and test files')),
        (('--valid_negative',),
         dict(type=int, default=100,
              help='Number of negative samples for each positive test example')),
        (('--seed', '-s'),
         dict(type=int, default=1,
              help='Manually set random seed for torch')),
        (('--test',),
         dict(type=str,
              help='select modification to be applied to the set')),
    )

    cli = ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()


def main():
    """Convert a ratings CSV into train/test tensors and a feature spec.

    Steps performed, in order:

    1. Load the ratings with ``implicit_load`` and optionally perturb the
       data set according to ``--test`` (``less_user``, ``less_item``,
       ``more_user``, ``more_item``).
    2. Re-map user and item IDs to consecutive integers with
       ``pd.factorize`` and record both cardinalities.
    3. Sort by timestamp, drop the ``rating``/``timestamp`` columns and
       duplicate rows, then split: the last row of every user goes to the
       test set, all earlier rows to the train set.
    4. Obtain ``--valid_negative`` negative items per test row from
       ``_TestNegSampler`` and optionally perturb the test set
       (``more_pos``, ``less_pos``).
    5. Save the train/test tensors (``TRAIN_0``, ``TRAIN_1``, ``TEST_0``,
       ``TEST_1``) and ``feature_spec.yaml`` into ``--output``.

    The tensors are moved to the GPU with ``.cuda()``, so a CUDA device is
    required.

    NOTE: ``--seed`` only seeds torch. The ``df.sample`` and
    ``np.random.rand`` calls used by the ``--test`` modifications are not
    seeded here.
    """
    args = parse_args()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    # Optional perturbations of the raw interactions. The modes are mutually
    # exclusive because they all compare the same --test value.
    if args.test == 'less_user':
        # Drop every row belonging to the last 100 distinct users.
        to_drop = set(list(df[USER_COLUMN].unique())[-100:])
        df = df[~df[USER_COLUMN].isin(to_drop)]
    elif args.test == 'less_item':
        # Drop every row belonging to the last 100 distinct items.
        to_drop = set(list(df[ITEM_COLUMN].unique())[-100:])
        df = df[~df[ITEM_COLUMN].isin(to_drop)]
    elif args.test == 'more_user':
        # Clone 20% of the rows under new user IDs (offset by 10,000,000).
        sample = df.sample(frac=0.2).copy()
        sample[USER_COLUMN] = sample[USER_COLUMN] + 10000000
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on both old and new pandas versions.
        df = pd.concat([df, sample])
        users = df[USER_COLUMN]
        # Keep only users with at least two rows:
        # make sure something remains in the train set
        df = df[users.isin(users[users.duplicated(keep=False)])]
    elif args.test == 'more_item':
        # Clone 20% of the rows under new item IDs (offset by 10,000,000).
        sample = df.sample(frac=0.2).copy()
        sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 10000000
        df = pd.concat([df, sample])

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    # IDs are 0-based after factorize, so cardinality is max + 1.
    user_cardinality = df[USER_COLUMN].max() + 1
    item_cardinality = df[ITEM_COLUMN].max() + 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order

    # Test set is the last interaction for a given user
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
    # Train set is all interactions but the last one
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    sampler = _TestNegSampler(train_data.values, args.valid_negative)
    test_negs = sampler.generate().cuda()
    # Arrange the negatives as one row per test example; with zero negatives
    # an explicit (num_test_rows, 0) shape is needed since -1 cannot be
    # inferred for an empty tensor.
    if args.valid_negative > 0:
        test_negs = test_negs.reshape(-1, args.valid_negative)
    else:
        test_negs = test_negs.reshape(test_data.shape[0], 0)

    # Optional perturbations of the test set.
    if args.test == 'more_pos':
        # Duplicate a random ~half of the test rows with a shifted item ID
        # and reuse the negatives of the rows they were copied from.
        mask = np.random.rand(len(test_data)) < 0.5
        sample = test_data[mask].copy()
        sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 5
        test_data = pd.concat([test_data, sample])
        test_negs_copy = test_negs[mask]
        test_negs = torch.cat((test_negs, test_negs_copy), dim=0)
    elif args.test == 'less_pos':
        # Keep only a random ~half of the test rows and their negatives.
        mask = np.random.rand(len(test_data)) < 0.5
        test_data = test_data[mask]
        test_negs = test_negs[mask]

    # Reshape train set into user,item,label tabular and save
    train_ratings = torch.from_numpy(train_data.values).cuda()
    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
    torch.save(train_labels, os.path.join(args.output, TRAIN_1))

    # Reshape test set into user,item,label tabular and save
    # All users have the same number of items, items for a given user appear consecutively
    test_ratings = torch.from_numpy(test_data.values).cuda()
    test_users_pos = test_ratings[:, 0:1]  # slicing instead of indexing to keep dimensions
    test_items_pos = test_ratings[:, 1:2]
    # Each test row expands to 1 positive followed by valid_negative negatives.
    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1, dim=0)
    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs), dim=1).reshape(-1, 1)
    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
    negative_labels = torch.zeros_like(test_users_pos, dtype=torch.float32).repeat(1, args.valid_negative)
    test_labels = torch.cat((positive_labels, negative_labels), dim=1).reshape(-1, 1)
    dtypes = {'user': str(test_users.dtype), 'item': str(test_items.dtype), 'label': str(test_labels.dtype)}
    test_tensor = torch.cat((test_users, test_items), dim=1)
    torch.save(test_tensor, os.path.join(args.output, TEST_0))
    torch.save(test_labels, os.path.join(args.output, TEST_1))

    # Build the spec path the same way as the tensor paths above.
    feature_spec_path = os.path.join(args.output, 'feature_spec.yaml')

    if args.test == 'other_names':
        # Write the spec with non-default feature names.
        dtypes = {'user_2': str(test_users.dtype),
                  'item_2': str(test_items.dtype),
                  'label_2': str(test_labels.dtype)}
        save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
                          test_negative_samples=args.valid_negative, output_path=feature_spec_path,
                          user_feature_name='user_2',
                          item_feature_name='item_2',
                          label_feature_name='label_2')
    else:
        save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
                          test_negative_samples=args.valid_negative, output_path=feature_spec_path)


# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()