# preprocessing.py
  1. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. from collections import defaultdict
  16. from glob import glob
  17. import pandas as pd
  18. from scipy import sparse
  19. import scipy.sparse as sp
  20. import numpy as np
  21. from scipy.sparse import load_npz, csr_matrix
  22. import logging
  23. import json
# Module-level logger shared by all preprocessing routines in this file.
LOG = logging.getLogger("VAE")
  25. def save_as_npz(m_sp, path):
  26. if not os.path.isdir(os.path.dirname(path)):
  27. os.makedirs(os.path.dirname(path))
  28. sp.save_npz(path, m_sp)
  29. def get_count(tp, id):
  30. playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
  31. count = playcount_groupbyid.size()
  32. return count
  33. def filter_triplets(tp, min_uc=5, min_sc=0):
  34. # Only keep the triplets for items which were clicked on by at least min_sc users.
  35. if min_sc > 0:
  36. itemcount = get_count(tp, 'movieId')
  37. tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]
  38. # Only keep the triplets for users who clicked on at least min_uc items
  39. # After doing this, some of the items will have less than min_uc users, but should only be a small proportion
  40. if min_uc > 0:
  41. usercount = get_count(tp, 'userId')
  42. tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]
  43. # Update both usercount and itemcount after filtering
  44. usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId')
  45. return tp, usercount, itemcount
  46. def save_id_mappings(cache_dir, show2id, profile2id):
  47. if not os.path.isdir(cache_dir):
  48. os.makedirs(cache_dir)
  49. for d, filename in [(show2id, 'show2id.json'),
  50. (profile2id, 'profile2id.json')]:
  51. with open(os.path.join(cache_dir, filename), 'w') as f:
  52. d = {str(k): v for k, v in d.items()}
  53. json.dump(d, f, indent=4)
def load_and_parse_ML_20M(data_dir, threshold=4, parse=True):
    """
    Original way of processing ml-20m dataset from VAE for CF paper.

    Binarizes ratings at ``threshold``, filters inactive users, splits users
    into train / validation / test groups, and caches the resulting sparse
    matrices as ``.npz`` files under ``<data_dir>/ml-20m/preprocessed``.
    Returns a 5-tuple of sparse matrices:
    (train, vad_true, vad_test, test_true, test_test).

    :param data_dir: root data directory containing ``ml-20m/extracted``
    :param threshold: minimum rating treated as a positive interaction
    :param parse: if False, raise instead of parsing when no cache exists

    Copyright [2018] [Dawen Liang, Rahul G. Krishnan, Matthew D. Hoffman, and Tony Jebara]
    SPDX-License-Identifier: Apache-2.0
    Modifications copyright (C) 2019 Michał Filipiuk, Albert Cieślak, Frederic Grabowski, Radosław Rowicki
    """
    cache_dir = os.path.join(data_dir, "ml-20m/preprocessed")
    train_data_file = os.path.join(cache_dir, "train_data.npz")
    vad_data_true_file = os.path.join(cache_dir, "vad_data_true.npz")
    vad_data_test_file = os.path.join(cache_dir, "vad_data_test.npz")
    test_data_true_file = os.path.join(cache_dir, "test_data_true.npz")
    test_data_test_file = os.path.join(cache_dir, "test_data_test.npz")

    # Fast path: if every cached matrix exists, load and return them.
    if (os.path.isfile(train_data_file)
            and os.path.isfile(vad_data_true_file)
            and os.path.isfile(vad_data_test_file)
            and os.path.isfile(test_data_true_file)
            and os.path.isfile(test_data_test_file)):
        LOG.info("Already processed, skipping.")
        return load_npz(train_data_file), \
            load_npz(vad_data_true_file), \
            load_npz(vad_data_test_file), \
            load_npz(test_data_true_file), \
            load_npz(test_data_test_file),

    if not parse:
        raise ValueError('Dataset not preprocessed. Please run python3 prepare_dataset.py first.')

    LOG.info("Parsing movielens.")
    source_file = os.path.join(data_dir, "ml-20m/extracted/ml-20m", "ratings.csv")
    if not glob(source_file):
        raise ValueError('Dataset not downloaded. Please download the ML-20m dataset from https://grouplens.org/datasets/movielens/20m/, unzip it and put it in ', source_file)

    # Binarize: keep only interactions rated at or above the threshold.
    raw_data = pd.read_csv(source_file)
    raw_data.drop('timestamp', axis=1, inplace=True)
    raw_data = raw_data[raw_data['rating'] >= threshold]
    raw_data, user_activity, item_popularity = filter_triplets(raw_data)

    # Shuffle users, then hold out the last 2 * 10000 for validation/test.
    # NOTE(review): depends on the global numpy RNG state — seed externally
    # for reproducible splits.
    unique_uid = user_activity.index
    idx_perm = np.random.permutation(unique_uid.size)
    unique_uid = unique_uid[idx_perm]
    n_users = unique_uid.size
    n_heldout_users = 10000
    true_users = unique_uid[:(n_users - n_heldout_users * 2)]
    vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
    test_users = unique_uid[(n_users - n_heldout_users):]

    # Items are restricted to those seen in the training users' interactions.
    train_plays = raw_data.loc[raw_data['userId'].isin(true_users)]
    unique_sid = pd.unique(train_plays['movieId'])
    show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
    profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))
    save_id_mappings(cache_dir, show2id, profile2id)

    def split_train_test_proportion(data, test_prop=0.2):
        # Per user: hold out ~test_prop of that user's items for evaluation.
        # Users with fewer than 5 items contribute only to the "true" part.
        data_grouped_by_user = data.groupby('userId')
        true_list, test_list = list(), list()
        for i, (_, group) in enumerate(data_grouped_by_user):
            n_items_u = len(group)
            if n_items_u >= 5:
                idx = np.zeros(n_items_u, dtype='bool')
                idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True
                true_list.append(group[np.logical_not(idx)])
                test_list.append(group[idx])
            else:
                true_list.append(group)
        data_true = pd.concat(true_list)
        data_test = pd.concat(test_list)
        return data_true, data_test

    # Validation/test users: keep only items known from training, then split
    # each user's history into an input ("true") and a held-out ("test") part.
    vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
    vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]
    vad_plays_true, vad_plays_test = split_train_test_proportion(vad_plays)
    test_plays = raw_data.loc[raw_data['userId'].isin(test_users)]
    test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]
    test_plays_true, test_plays_test = split_train_test_proportion(test_plays)

    def numerize(tp):
        # Map raw userId/movieId values to contiguous row/column indices.
        uid = tp['userId'].map(lambda x: profile2id[x])
        sid = tp['movieId'].map(lambda x: show2id[x])
        return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

    train_data = numerize(train_plays)
    vad_data_true = numerize(vad_plays_true)
    vad_data_test = numerize(vad_plays_test)
    test_data_true = numerize(test_plays_true)
    test_data_test = numerize(test_plays_test)
    n_items = len(unique_sid)

    def load_train_data(tp):
        # Build a binary user x item CSR matrix from (uid, sid) pairs.
        n_users = tp['uid'].max() + 1
        rows, cols = tp['uid'], tp['sid']
        data = sparse.csr_matrix((np.ones_like(rows),
                                  (rows, cols)), dtype='float64',
                                 shape=(n_users, n_items))
        return data

    train_data = load_train_data(train_data)

    def load_true_test_data(tp_true, tp_test):
        # Build a pair of aligned CSR matrices for the held-out users;
        # rows are offset by the smallest uid so both share the same shape.
        start_idx = min(tp_true['uid'].min(), tp_test['uid'].min())
        end_idx = max(tp_true['uid'].max(), tp_test['uid'].max())
        rows_true, cols_true = tp_true['uid'] - start_idx, tp_true['sid']
        rows_test, cols_test = tp_test['uid'] - start_idx, tp_test['sid']
        data_true = sparse.csr_matrix((np.ones_like(rows_true),
                                       (rows_true, cols_true)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
        data_test = sparse.csr_matrix((np.ones_like(rows_test),
                                       (rows_test, cols_test)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
        return data_true, data_test

    vad_data_true, vad_data_test = load_true_test_data(vad_data_true, vad_data_test)
    test_data_true, test_data_test = load_true_test_data(test_data_true, test_data_test)

    # Cache everything so subsequent calls take the fast path above.
    save_as_npz(train_data, train_data_file)
    save_as_npz(vad_data_true, vad_data_true_file)
    save_as_npz(vad_data_test, vad_data_test_file)
    save_as_npz(test_data_true, test_data_true_file)
    save_as_npz(test_data_test, test_data_test_file)
    return train_data, vad_data_true, vad_data_test, test_data_true, test_data_test
  158. def filter_data(data, min_users=1, min_items=5):
  159. """
  160. :param data: input matrix
  161. :param min_users: only keep items, that were clicked by at least min_users
  162. :param min_items: only keep users, that clicked at least min_items
  163. :return: filtered matrix
  164. """
  165. col_count = defaultdict(lambda: 0)
  166. for col in data.nonzero()[1]:
  167. col_count[col] += 1
  168. filtered_col = [k for k, v in col_count.items() if v >= min_users]
  169. filtered_data_c = data[:, filtered_col]
  170. del data
  171. row_count = defaultdict(lambda: 0)
  172. for row in filtered_data_c.nonzero()[0]:
  173. row_count[row] += 1
  174. filtered_row = [k for k, v in row_count.items() if v >= min_items]
  175. filtered_data_r = filtered_data_c[filtered_row, :]
  176. del filtered_data_c
  177. return filtered_data_r
  178. def split_into_train_val_test(data, val_ratio, test_ratio):
  179. """
  180. :param data: input matrix
  181. :param val_ratio: Ratio of validation users to all users
  182. :param test_ratio: Ratio of test users to all users
  183. :return: Tuple of 3 matrices : {train_matrix, val_matrix, test_matrix}
  184. """
  185. assert val_ratio + test_ratio < 1
  186. train_ratio = 1 - val_ratio - test_ratio
  187. rows_count = data.shape[0]
  188. idx = np.random.permutation(range(rows_count))
  189. train_users_count = int(np.rint(rows_count * train_ratio))
  190. val_users_count = int(np.rint(rows_count * val_ratio))
  191. seperator = train_users_count + val_users_count
  192. train_matrix = data[idx[:train_users_count]]
  193. val_matrix = data[idx[train_users_count:seperator]]
  194. test_matrix = data[idx[seperator:]]
  195. return train_matrix, val_matrix, test_matrix
  196. def split_movies_into_train_test(data, train_ratio):
  197. """
  198. Splits data into 2 matrices. The users stay the same, but the items are being split by train_ratio
  199. :param data: input matrix
  200. :param train_ratio: Ratio of input items to all items
  201. :return: tuple of 2 matrices: {train_matrix, test_matrix}
  202. """
  203. rows_count, columns_count = data.shape
  204. train_rows = list()
  205. train_columns = list()
  206. test_rows = list()
  207. test_columns = list()
  208. for i in range(rows_count):
  209. user_movies = data.getrow(i).nonzero()[1]
  210. np.random.shuffle(user_movies)
  211. movies_count = len(user_movies)
  212. train_count = int(np.floor(movies_count * train_ratio))
  213. test_count = movies_count - train_count
  214. train_movies = user_movies[:train_count]
  215. test_movies = user_movies[train_count:]
  216. train_rows += ([i] * train_count)
  217. train_columns += list(train_movies)
  218. test_rows += ([i] * test_count)
  219. test_columns += list(test_movies)
  220. train_matrix = csr_matrix(([1] * len(train_rows), (train_rows, train_columns)), shape=(rows_count, columns_count))
  221. test_matrix = csr_matrix(([1] * len(test_rows), (test_rows, test_columns)), shape=(rows_count, columns_count))
  222. return train_matrix, test_matrix
  223. def remove_items_that_doesnt_occure_in_train(train_matrix, val_matrix, test_matrix):
  224. """
  225. Remove items that don't occure in train matrix
  226. :param train_matrix: training data
  227. :param val_matrix: validation data
  228. :param test_matrix: test data
  229. :return: Input matrices without some items
  230. """
  231. item_occure = defaultdict(lambda: False)
  232. for col in train_matrix.nonzero()[1]:
  233. item_occure[col] = True
  234. non_empty_items = [k for k, v in item_occure.items() if v == True]
  235. return train_matrix[:, non_empty_items], val_matrix[:, non_empty_items], test_matrix[:, non_empty_items]