| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- # Copyright (c) 2018, deepakn94. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from collections import namedtuple
- import pandas as pd
- RatingData = namedtuple('RatingData',
- ['items', 'users', 'ratings', 'min_date', 'max_date'])
- def describe_ratings(ratings):
- info = RatingData(items=len(ratings['item_id'].unique()),
- users=len(ratings['user_id'].unique()),
- ratings=len(ratings),
- min_date=ratings['timestamp'].min(),
- max_date=ratings['timestamp'].max())
- print("{ratings} ratings on {items} items from {users} users"
- " from {min_date} to {max_date}"
- .format(**(info._asdict())))
- return info
- def process_movielens(ratings, sort=True):
- ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
- if sort:
- ratings.sort_values(by='timestamp', inplace=True)
- describe_ratings(ratings)
- return ratings
- def load_ml_100k(filename, sort=True):
- names = ['user_id', 'item_id', 'rating', 'timestamp']
- ratings = pd.read_csv(filename, sep='\t', names=names)
- return process_movielens(ratings, sort=sort)
- def load_ml_1m(filename, sort=True):
- names = ['user_id', 'item_id', 'rating', 'timestamp']
- ratings = pd.read_csv(filename, sep='::', names=names, engine='python')
- return process_movielens(ratings, sort=sort)
- def load_ml_10m(filename, sort=True):
- names = ['user_id', 'item_id', 'rating', 'timestamp']
- ratings = pd.read_csv(filename, sep='::', names=names, engine='python')
- return process_movielens(ratings, sort=sort)
- def load_ml_20m(filename, sort=True):
- ratings = pd.read_csv(filename)
- ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
- names = {'userId': 'user_id', 'movieId': 'item_id'}
- ratings.rename(columns=names, inplace=True)
- return process_movielens(ratings, sort=sort)
- DATASETS = [k.replace('load_', '') for k in locals().keys() if "load_" in k]
- def get_dataset_name(filename):
- for dataset in DATASETS:
- if dataset in filename.replace('-', '_').lower():
- return dataset
- raise NotImplementedError
- def implicit_load(filename, sort=True):
- func = globals()["load_" + get_dataset_name(filename)]
- return func(filename, sort=sort)
|