- # Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- # -----------------------------------------------------------------------
- #
- # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from argparse import ArgumentParser
- import pandas as pd
- from load import implicit_load
- from feature_spec import FeatureSpec
- from neumf_constants import USER_CHANNEL_NAME, ITEM_CHANNEL_NAME, LABEL_CHANNEL_NAME, TEST_SAMPLES_PER_SERIES
- import torch
- import os
- import tqdm
# Output file names for the serialized tensors produced by this script:
# *_0 holds the (user, item) feature tensor, *_1 holds the matching labels
# (see the train/test mappings built in save_feature_spec).
TEST_1 = 'test_data_1.pt'
TEST_0 = 'test_data_0.pt'
TRAIN_1 = 'train_data_1.pt'
TRAIN_0 = 'train_data_0.pt'
# Column names of the intermediate pandas DataFrame loaded by implicit_load.
USER_COLUMN = 'user_id'
ITEM_COLUMN = 'item_id'
def parse_args():
    """Parse command-line options for the preprocessing script.

    Returns:
        argparse.Namespace with `path`, `output`, `valid_negative`, `seed`.
    """
    parser = ArgumentParser()
    # Declarative option table: (flags, add_argument keyword options).
    option_specs = [
        (('--path',),
         dict(type=str, default='/data/ml-20m/ratings.csv',
              help='Path to reviews CSV file from MovieLens')),
        (('--output',),
         dict(type=str, default='/data',
              help='Output directory for train and test files')),
        (('--valid_negative',),
         dict(type=int, default=100,
              help='Number of negative samples for each positive test example')),
        (('--seed', '-s'),
         dict(type=int, default=1,
              help='Manually set random seed for torch')),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()
class _TestNegSampler:
    """Draws random negative items per user for building the validation set.

    A "negative" for user u is an item u never interacted with in the
    training ratings; candidates are drawn uniformly and rejected while
    they collide with a known (user, item) training pair.
    """

    def __init__(self, train_ratings, nb_neg):
        # train_ratings: 2-D array of [user_id, item_id] rows — in this script
        # a numpy ndarray (main() passes `train_data.values`).
        # Ids are assumed zero-based and contiguous, so max()+1 is the cardinality.
        self.nb_neg = nb_neg
        self.nb_users = int(train_ratings[:, 0].max()) + 1
        self.nb_items = int(train_ratings[:, 1].max()) + 1

        # compute unique ids for quickly created hash set and fast lookup
        # (each (user, item) pair is flattened to user * nb_items + item)
        ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
        self.set = set(ids)

    def generate(self, batch_size=128 * 1024):
        # Every user id repeated nb_neg times, grouped per user:
        # [0,0,...,0, 1,1,...,1, ...].
        users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)

        items = [-1] * len(users)

        # Candidate items are pre-drawn in large batches to amortize RNG cost;
        # the pool is refilled whenever it runs dry.
        random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
        print('Generating validation negatives...')
        for idx, u in enumerate(tqdm.tqdm(users.tolist())):
            if not random_items:
                random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
            j = random_items.pop()
            # Rejection sampling: retry until the candidate is a true negative
            # (i.e. the flattened pair id is not in the training set).
            while u * self.nb_items + j in self.set:
                if not random_items:
                    random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
                j = random_items.pop()
            items[idx] = j

        # Returns a 1-D LongTensor of length nb_users * nb_neg, ordered to
        # match `users` above (all negatives for user 0 first, then user 1, ...).
        items = torch.LongTensor(items)
        return items
def save_feature_spec(user_cardinality, item_cardinality, dtypes, test_negative_samples, output_path,
                      user_feature_name='user',
                      item_feature_name='item',
                      label_feature_name='label'):
    """Write a FeatureSpec YAML describing the tensor files saved by main().

    The spec records each feature's dtype (and cardinality for user/item),
    which saved file carries which feature for the train and test splits,
    and how many samples form one test series (1 positive + the negatives).
    """
    feature_spec = {
        user_feature_name: {'dtype': dtypes[user_feature_name], 'cardinality': int(user_cardinality)},
        item_feature_name: {'dtype': dtypes[item_feature_name], 'cardinality': int(item_cardinality)},
        label_feature_name: {'dtype': dtypes[label_feature_name]},
    }
    metadata = {TEST_SAMPLES_PER_SERIES: test_negative_samples + 1}

    def tensor_chunk(features, files):
        # One source-spec entry: a serialized torch tensor file holding `features`.
        return {'type': 'torch_tensor', 'features': features, 'files': files}

    source_spec = {
        'train': [
            tensor_chunk([user_feature_name, item_feature_name], [TRAIN_0]),
            tensor_chunk([label_feature_name], [TRAIN_1]),
        ],
        'test': [
            tensor_chunk([user_feature_name, item_feature_name], [TEST_0]),
            tensor_chunk([label_feature_name], [TEST_1]),
        ],
    }
    channel_spec = {
        USER_CHANNEL_NAME: [user_feature_name],
        ITEM_CHANNEL_NAME: [item_feature_name],
        LABEL_CHANNEL_NAME: [label_feature_name],
    }
    spec = FeatureSpec(feature_spec=feature_spec, metadata=metadata, source_spec=source_spec,
                       channel_spec=channel_spec, base_directory="")
    spec.to_yaml(output_path=output_path)
def main():
    """End-to-end preprocessing for NCF.

    Loads raw MovieLens ratings, remaps ids to contiguous ranges, splits the
    last interaction per user into the test set, samples validation
    negatives, and saves train/test tensors plus a feature spec YAML into
    `args.output`. Requires a CUDA device (tensors are moved with .cuda()).
    """
    args = parse_args()
    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    # factorize() yields 0..n-1, so max()+1 is the cardinality.
    user_cardinality = df[USER_COLUMN].max() + 1
    item_cardinality = df[ITEM_COLUMN].max() + 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order

    # Test set is the last interaction for a given user
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)

    # Train set is all interactions but the last one
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    sampler = _TestNegSampler(train_data.values, args.valid_negative)
    test_negs = sampler.generate().cuda()
    # One row of valid_negative negative items per test user.
    test_negs = test_negs.reshape(-1, args.valid_negative)

    # Reshape train set into user,item,label tabular and save
    train_ratings = torch.from_numpy(train_data.values).cuda()
    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
    torch.save(train_labels, os.path.join(args.output, TRAIN_1))

    # Reshape test set into user,item,label tabular and save
    # All users have the same number of items, items for a given user appear consecutively
    test_ratings = torch.from_numpy(test_data.values).cuda()
    test_users_pos = test_ratings[:, 0:1]  # slicing instead of indexing to keep dimensions
    test_items_pos = test_ratings[:, 1:2]
    # Each user id repeated (1 positive + valid_negative negatives) times.
    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1, dim=0)
    # Positive item first, then its negatives, flattened to one column.
    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs), dim=1).reshape(-1, 1)
    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
    negative_labels = torch.zeros_like(test_users_pos, dtype=torch.float32).repeat(1, args.valid_negative)
    test_labels = torch.cat((positive_labels, negative_labels), dim=1).reshape(-1, 1)
    dtypes = {'user': str(test_users.dtype), 'item': str(test_items.dtype), 'label': str(test_labels.dtype)}
    test_tensor = torch.cat((test_users, test_items), dim=1)
    torch.save(test_tensor, os.path.join(args.output, TEST_0))
    torch.save(test_labels, os.path.join(args.output, TEST_1))

    # Consistency fix: build this path with os.path.join like every other
    # artifact above, instead of string concatenation with a hard-coded '/'.
    save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
                      test_negative_samples=args.valid_negative,
                      output_path=os.path.join(args.output, 'feature_spec.yaml'))


if __name__ == '__main__':
    main()