load.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. # Copyright (c) 2018, deepakn94. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from collections import namedtuple
  15. import pandas as pd
  16. RatingData = namedtuple('RatingData',
  17. ['items', 'users', 'ratings', 'min_date', 'max_date'])
  18. def describe_ratings(ratings):
  19. info = RatingData(items=len(ratings['item_id'].unique()),
  20. users=len(ratings['user_id'].unique()),
  21. ratings=len(ratings),
  22. min_date=ratings['timestamp'].min(),
  23. max_date=ratings['timestamp'].max())
  24. print("{ratings} ratings on {items} items from {users} users"
  25. " from {min_date} to {max_date}"
  26. .format(**(info._asdict())))
  27. return info
  28. def process_movielens(ratings, sort=True):
  29. ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
  30. if sort:
  31. ratings.sort_values(by='timestamp', inplace=True)
  32. describe_ratings(ratings)
  33. return ratings
  34. def load_ml_100k(filename, sort=True):
  35. names = ['user_id', 'item_id', 'rating', 'timestamp']
  36. ratings = pd.read_csv(filename, sep='\t', names=names)
  37. return process_movielens(ratings, sort=sort)
  38. def load_ml_1m(filename, sort=True):
  39. names = ['user_id', 'item_id', 'rating', 'timestamp']
  40. ratings = pd.read_csv(filename, sep='::', names=names, engine='python')
  41. return process_movielens(ratings, sort=sort)
  42. def load_ml_10m(filename, sort=True):
  43. names = ['user_id', 'item_id', 'rating', 'timestamp']
  44. ratings = pd.read_csv(filename, sep='::', names=names, engine='python')
  45. return process_movielens(ratings, sort=sort)
  46. def load_ml_20m(filename, sort=True):
  47. ratings = pd.read_csv(filename)
  48. ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
  49. names = {'userId': 'user_id', 'movieId': 'item_id'}
  50. ratings.rename(columns=names, inplace=True)
  51. return process_movielens(ratings, sort=sort)
  52. DATASETS = [k.replace('load_', '') for k in locals().keys() if "load_" in k]
  53. def get_dataset_name(filename):
  54. for dataset in DATASETS:
  55. if dataset in filename.replace('-', '_').lower():
  56. return dataset
  57. raise NotImplementedError
  58. def implicit_load(filename, sort=True):
  59. func = globals()["load_" + get_dataset_name(filename)]
  60. return func(filename, sort=sort)