convert.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. # Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # -----------------------------------------------------------------------
  16. #
  17. # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  18. #
  19. # Licensed under the Apache License, Version 2.0 (the "License");
  20. # you may not use this file except in compliance with the License.
  21. # You may obtain a copy of the License at
  22. #
  23. # http://www.apache.org/licenses/LICENSE-2.0
  24. #
  25. # Unless required by applicable law or agreed to in writing, software
  26. # distributed under the License is distributed on an "AS IS" BASIS,
  27. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  28. # See the License for the specific language governing permissions and
  29. # limitations under the License.
  30. from argparse import ArgumentParser
  31. import pandas as pd
  32. from load import implicit_load
  33. from feature_spec import FeatureSpec
  34. from neumf_constants import USER_CHANNEL_NAME, ITEM_CHANNEL_NAME, LABEL_CHANNEL_NAME, TEST_SAMPLES_PER_SERIES
  35. import torch
  36. import os
  37. import tqdm
# File names of the serialized torch tensors this script writes into --output.
# The *_0 files hold (user, item) id pairs; the *_1 files hold the matching labels.
TEST_1 = 'test_data_1.pt'
TEST_0 = 'test_data_0.pt'
TRAIN_1 = 'train_data_1.pt'
TRAIN_0 = 'train_data_0.pt'
# Column names used for the intermediate pandas DataFrame after loading.
USER_COLUMN = 'user_id'
ITEM_COLUMN = 'item_id'
  44. def parse_args():
  45. parser = ArgumentParser()
  46. parser.add_argument('--path', type=str, default='/data/ml-20m/ratings.csv',
  47. help='Path to reviews CSV file from MovieLens')
  48. parser.add_argument('--output', type=str, default='/data',
  49. help='Output directory for train and test files')
  50. parser.add_argument('--valid_negative', type=int, default=100,
  51. help='Number of negative samples for each positive test example')
  52. parser.add_argument('--seed', '-s', type=int, default=1,
  53. help='Manually set random seed for torch')
  54. return parser.parse_args()
  55. class _TestNegSampler:
  56. def __init__(self, train_ratings, nb_neg):
  57. self.nb_neg = nb_neg
  58. self.nb_users = int(train_ratings[:, 0].max()) + 1
  59. self.nb_items = int(train_ratings[:, 1].max()) + 1
  60. # compute unique ids for quickly created hash set and fast lookup
  61. ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
  62. self.set = set(ids)
  63. def generate(self, batch_size=128 * 1024):
  64. users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)
  65. items = [-1] * len(users)
  66. random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
  67. print('Generating validation negatives...')
  68. for idx, u in enumerate(tqdm.tqdm(users.tolist())):
  69. if not random_items:
  70. random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
  71. j = random_items.pop()
  72. while u * self.nb_items + j in self.set:
  73. if not random_items:
  74. random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
  75. j = random_items.pop()
  76. items[idx] = j
  77. items = torch.LongTensor(items)
  78. return items
  79. def save_feature_spec(user_cardinality, item_cardinality, dtypes, test_negative_samples, output_path,
  80. user_feature_name='user',
  81. item_feature_name='item',
  82. label_feature_name='label'):
  83. feature_spec = {
  84. user_feature_name: {
  85. 'dtype': dtypes[user_feature_name],
  86. 'cardinality': int(user_cardinality)
  87. },
  88. item_feature_name: {
  89. 'dtype': dtypes[item_feature_name],
  90. 'cardinality': int(item_cardinality)
  91. },
  92. label_feature_name: {
  93. 'dtype': dtypes[label_feature_name],
  94. }
  95. }
  96. metadata = {
  97. TEST_SAMPLES_PER_SERIES: test_negative_samples + 1
  98. }
  99. train_mapping = [
  100. {
  101. 'type': 'torch_tensor',
  102. 'features': [
  103. user_feature_name,
  104. item_feature_name
  105. ],
  106. 'files': [TRAIN_0]
  107. },
  108. {
  109. 'type': 'torch_tensor',
  110. 'features': [
  111. label_feature_name
  112. ],
  113. 'files': [TRAIN_1]
  114. }
  115. ]
  116. test_mapping = [
  117. {
  118. 'type': 'torch_tensor',
  119. 'features': [
  120. user_feature_name,
  121. item_feature_name
  122. ],
  123. 'files': [TEST_0],
  124. },
  125. {
  126. 'type': 'torch_tensor',
  127. 'features': [
  128. label_feature_name
  129. ],
  130. 'files': [TEST_1],
  131. }
  132. ]
  133. channel_spec = {
  134. USER_CHANNEL_NAME: [user_feature_name],
  135. ITEM_CHANNEL_NAME: [item_feature_name],
  136. LABEL_CHANNEL_NAME: [label_feature_name]
  137. }
  138. source_spec = {'train': train_mapping, 'test': test_mapping}
  139. feature_spec = FeatureSpec(feature_spec=feature_spec, metadata=metadata, source_spec=source_spec,
  140. channel_spec=channel_spec, base_directory="")
  141. feature_spec.to_yaml(output_path=output_path)
  142. def main():
  143. args = parse_args()
  144. if args.seed is not None:
  145. torch.manual_seed(args.seed)
  146. print("Loading raw data from {}".format(args.path))
  147. df = implicit_load(args.path, sort=False)
  148. print("Mapping original user and item IDs to new sequential IDs")
  149. df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
  150. df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]
  151. user_cardinality = df[USER_COLUMN].max() + 1
  152. item_cardinality = df[ITEM_COLUMN].max() + 1
  153. # Need to sort before popping to get last item
  154. df.sort_values(by='timestamp', inplace=True)
  155. # clean up data
  156. del df['rating'], df['timestamp']
  157. df = df.drop_duplicates() # assuming it keeps order
  158. # Test set is the last interaction for a given user
  159. grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
  160. test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
  161. # Train set is all interactions but the last one
  162. train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])
  163. sampler = _TestNegSampler(train_data.values, args.valid_negative)
  164. test_negs = sampler.generate().cuda()
  165. test_negs = test_negs.reshape(-1, args.valid_negative)
  166. # Reshape train set into user,item,label tabular and save
  167. train_ratings = torch.from_numpy(train_data.values).cuda()
  168. train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
  169. torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
  170. torch.save(train_labels, os.path.join(args.output, TRAIN_1))
  171. # Reshape test set into user,item,label tabular and save
  172. # All users have the same number of items, items for a given user appear consecutively
  173. test_ratings = torch.from_numpy(test_data.values).cuda()
  174. test_users_pos = test_ratings[:, 0:1] # slicing instead of indexing to keep dimensions
  175. test_items_pos = test_ratings[:, 1:2]
  176. test_users = test_users_pos.repeat_interleave(args.valid_negative + 1, dim=0)
  177. test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs), dim=1).reshape(-1, 1)
  178. positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
  179. negative_labels = torch.zeros_like(test_users_pos, dtype=torch.float32).repeat(1, args.valid_negative)
  180. test_labels = torch.cat((positive_labels, negative_labels), dim=1).reshape(-1, 1)
  181. dtypes = {'user': str(test_users.dtype), 'item': str(test_items.dtype), 'label': str(test_labels.dtype)}
  182. test_tensor = torch.cat((test_users, test_items), dim=1)
  183. torch.save(test_tensor, os.path.join(args.output, TEST_0))
  184. torch.save(test_labels, os.path.join(args.output, TEST_1))
  185. save_feature_spec(user_cardinality=user_cardinality, item_cardinality=item_cardinality, dtypes=dtypes,
  186. test_negative_samples=args.valid_negative, output_path=args.output + '/feature_spec.yaml')
  187. if __name__ == '__main__':
  188. main()