AmirShabani committed on
Commit
d19f826
·
1 Parent(s): 3adde17

Pretrained Model

Browse files
btree.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5177b0ca060d54769061e9a3c83b1439d0436c14baf02d293ca94ff3342fb645
3
+ size 231685
core.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import numpy as np
3
+ import pandas as pd
4
+ import random
5
+ from tqdm import tqdm
6
+ import matplotlib.pyplot as plt
7
+ from sklearn.model_selection import train_test_split
8
+ import torch
9
+ from torch import nn, optim, Tensor
10
+ from torch_sparse import SparseTensor, matmul
11
+ from torch_geometric.utils import structured_negative_sampling
12
+ from torch_geometric.data import download_url, extract_zip
13
+ from torch_geometric.nn.conv.gcn_conv import gcn_norm
14
+ from torch_geometric.nn.conv import MessagePassing
15
+ from torch_geometric.typing import Adj
16
+ from sklearn.neighbors import BallTree
17
class LightGCN(MessagePassing):
    """LightGCN-style recommender implemented as a torch_geometric layer.

    Users and items live in one node index space (all users first, then all
    items).  The learned layer-0 embeddings are propagated over the
    normalized graph ``diffusion_steps`` times and every layer, including
    layer 0, is averaged into the final embedding.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, diffusion_steps=3, add_self_loops=False):
        """Create and initialise the layer-0 embedding tables.

        Args:
            num_users: Number of user nodes.
            num_items: Number of item nodes.
            embedding_dim: Width of every embedding vector.
            diffusion_steps: Number of propagation rounds (K).
            add_self_loops: Forwarded to ``gcn_norm`` when the adjacency
                matrix is normalized in ``forward``.
        """
        super().__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.diffusion_steps = diffusion_steps
        self.add_self_loops = add_self_loops

        # Layer-0 tables: e_u^0 for users and e_i^0 for items.
        self.users_emb = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.items_emb = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)

        # Re-draw both tables from a normal distribution with std 0.1
        # (users first, then items, so the random stream is consumed in a
        # fixed order).
        for table in (self.users_emb, self.items_emb):
            nn.init.normal_(table.weight, std=0.1)

    def forward(self, edge_index: SparseTensor):
        """Propagate the embeddings and average all layers.

        Args:
            edge_index: Sparse adjacency over the joint user+item node set.

        Returns:
            A 4-tuple ``(users_final, users_initial, items_final,
            items_initial)`` where the ``*_initial`` entries are the raw
            layer-0 embedding weights.
        """
        # Normalized adjacency used for every propagation round.
        norm_adj = gcn_norm(edge_index, add_self_loops=self.add_self_loops)

        # E^0: user rows followed by item rows.
        current = torch.cat([self.users_emb.weight, self.items_emb.weight])
        layers = [current]

        # E^1 ... E^K, each obtained from the previous layer.
        for _ in range(self.diffusion_steps):
            current = self.propagate(norm_adj, x=current)
            layers.append(current)

        # Average E^0 ... E^K along a temporary layer axis.
        averaged = torch.stack(layers, dim=1).mean(dim=1)

        # Undo the concatenation: first num_users rows are users, rest items.
        users_final, items_final = torch.split(averaged, [self.num_users, self.num_items])

        return users_final, self.users_emb.weight, items_final, self.items_emb.weight

    def message(self, x_j: Tensor) -> Tensor:
        """Pass neighbour features through unchanged."""
        return x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Fuse message and aggregation into one sparse-dense matmul."""
        return matmul(adj_t, x)
82
+
83
+
84
# Build the network for 671 users and 9125 items.  These sizes have to match
# the embedding tables stored in the checkpoint loaded further down, because
# load_state_dict is called with its default strict shape checking.
model = LightGCN(num_users=671, num_items=9125)
85
+
86
def get_movie_recommendations(user_id, num_recomms):
    """Rank all movies for a known user and split the top ones by familiarity.

    Every item is scored by the dot product of its layer-0 embedding with the
    user's layer-0 embedding.  Reads the module-level ``model``,
    ``user_mapping``, ``movie_mapping`` and ``user_pos_items``.

    Args:
        user_id: Key into both ``user_mapping`` and ``user_pos_items``.
        num_recomms: Maximum length of each returned list.

    Returns:
        ``(rated_movie_ids, suggested_movie_ids)``.  The first list holds
        top-ranked movies that are among the user's positive items, the
        second holds top-ranked movies that are not.  Both are in descending
        score order and contain keys of ``movie_mapping``.

    Raises:
        KeyError: If ``user_id`` is unknown, or a ranked item index has no
            entry in ``movie_mapping``.
    """
    user_index = user_mapping[user_id]
    positives = user_pos_items[user_id]

    user_embedding = model.users_emb.weight[user_index]
    scores = model.items_emb.weight @ user_embedding

    # Request enough candidates that num_recomms unseen movies remain even if
    # every positive item ranks at the very top.  torch.topk raises when k
    # exceeds the number of scores, so clamp to the catalogue size.
    k = min(len(positives) + num_recomms, scores.numel())
    _, indices = torch.topk(scores, k=k)

    # Work with plain ints: hashable, and O(1) membership against a set.
    ranked = indices.tolist()
    positive_set = {int(item) for item in positives}

    rated_movies = [idx for idx in ranked if idx in positive_set][:num_recomms]
    suggested_movies = [idx for idx in ranked if idx not in positive_set][:num_recomms]

    # Invert movie_mapping once instead of a linear search per movie.
    # setdefault keeps the first key for a given index, matching the
    # first-match behaviour of list.index.
    index_to_movie_id = {}
    for movie_id, movie_index in movie_mapping.items():
        index_to_movie_id.setdefault(movie_index, movie_id)

    rated_movie_ids = [index_to_movie_id[idx] for idx in rated_movies]
    suggested_movie_ids = [index_to_movie_id[idx] for idx in suggested_movies]

    return rated_movie_ids, suggested_movie_ids
108
+
109
# Directory that holds the checkpoint and the pickled artifacts.
addr = './'

# map_location keeps the load working on CPU-only hosts even when the
# checkpoint was written from a GPU; the model itself is never moved off CPU.
model.load_state_dict(torch.load(addr + 'model.pth', map_location='cpu'))

# NOTE(security): pickle.load executes code embedded in the file.  Only load
# artifacts that ship with this repository or come from a trusted source.
# Each file is opened in a ``with`` block so the handle is released even if
# unpickling fails.

# Table whose 'title' column is used to turn movie ids into titles.
with open(addr + 'final_movies.pkl', "rb") as final_movies_file:
    final_movies = pickle.load(final_movies_file)

# Per-movie embedding table, merged on 'movieId' in create_user_embedding.
with open(addr + 'movie_embeds.pkl', "rb") as movie_embeds_file:
    movie_embeds = pickle.load(movie_embeds_file)

# Pre-built nearest-neighbour tree queried in find_closest_user.
with open(addr + 'btree.pkl', "rb") as btree_file:
    btree = pickle.load(btree_file)

# Per-user table; rows are selected by position and expose 'userId'.
with open(addr + 'user_embeds.pkl', "rb") as user_embeds_file:
    user_embeds = pickle.load(user_embeds_file)

# user id -> row index in the model's user embedding table.
with open(addr + 'user_mapping.pkl', "rb") as user_mapping_file:
    user_mapping = pickle.load(user_mapping_file)

# movie id -> row index in the model's item embedding table.
with open(addr + 'movie_mapping.pkl', "rb") as movie_mapping_file:
    movie_mapping = pickle.load(movie_mapping_file)

# Per-user collection of positive item indices.
with open(addr + 'user_pos_items.pkl', "rb") as user_pos_items_file:
    user_pos_items = pickle.load(user_pos_items_file)
140
+
141
def create_user_embedding(movie_ratings, movies_df):
    """Build a rating-weighted sum of movie embeddings for an ad-hoc user.

    Args:
        movie_ratings: Mapping of movie id -> rating.
        movies_df: DataFrame with a ``movieId`` column; every other column is
            treated as one embedding dimension.

    Returns:
        1-D ``numpy.ndarray`` with one entry per embedding column.  Movies
        that are missing from ``movies_df`` contribute nothing; an empty
        ``movie_ratings`` yields an all-zero vector.
    """
    if not movie_ratings:
        # Nothing to aggregate: one zero per non-key column of movies_df.
        return np.zeros(movies_df.shape[1] - 1)

    # One row per rated movie, with the movie id as an ordinary column so it
    # can serve as the merge key.
    user_ratings_df = pd.DataFrame.from_dict(movie_ratings, orient='index', columns=['rating'])
    user_ratings_df['movieId'] = user_ratings_df.index

    # Left merge keeps every rated movie; unknown ids get NaN embeddings.
    merged = user_ratings_df.merge(movies_df, on='movieId', how='left')

    # Columns 0 and 1 are 'rating' and 'movieId'; the rest is the embedding.
    embeddings = merged.iloc[:, 2:].to_numpy(dtype=float)
    ratings = merged['rating'].to_numpy(dtype=float)
    weighted = embeddings * ratings[:, np.newaxis]

    # nansum skips the NaN rows of unknown movies.  A plain sum would turn
    # every dimension into NaN as soon as a single movie is unknown, which
    # the old nan_to_num call then flattened to an all-zero embedding.
    user_embedding = np.nansum(weighted, axis=0)

    # Still clamp any infinities to finite values, as before.
    return np.nan_to_num(user_embedding)
157
+
158
def find_closest_user(user_embedding, tree, user_embeddings):
    """Return the stored user row nearest to ``user_embedding``.

    Args:
        user_embedding: Query vector for the new user.
        tree: Nearest-neighbour index exposing ``query(X, k)`` that returns
            ``(distances, indices)`` with one row per query point.
        user_embeddings: DataFrame whose row positions line up with the
            indices returned by ``tree``.

    Returns:
        The row of ``user_embeddings`` at the position of the single nearest
        neighbour.
    """
    # The tree expects a batch of points, so wrap the single query in a list.
    distances, neighbour_positions = tree.query([user_embedding], k=1)

    # First (only) query point, first (only) neighbour.
    nearest_position = neighbour_positions[0][0]
    return user_embeddings.iloc[nearest_position]
166
+
167
def output_list(movie_ratings, movies_df=movie_embeds, tree=btree, user_embeddings=user_embeds, movies=final_movies, num_recomms=5):
    """Recommend movie titles for an ad-hoc set of ratings.

    The ratings are turned into an embedding, the nearest stored user is
    looked up, and that user's top-ranked movies are returned, minus anything
    the caller already rated.

    Args:
        movie_ratings: Mapping of movie id -> rating supplied by the caller.
        movies_df: Movie embedding table passed to ``create_user_embedding``.
        tree: Nearest-neighbour index passed to ``find_closest_user``.
        user_embeddings: User table passed to ``find_closest_user``; the
            selected row must expose ``'userId'``.
        movies: Table whose ``'title'`` column is indexed by movie id.
        num_recomms: Upper bound on both the "already liked" and the "new"
            list requested from ``get_movie_recommendations``.

    Returns:
        List of titles in ranking order (the proxy user's liked movies first,
        then new suggestions), without duplicates.
    """
    # Use the arguments rather than the module globals so callers can supply
    # their own tables; the defaults still point at the loaded artifacts.
    user_embed = create_user_embedding(movie_ratings, movies_df)
    closest_user_embed = find_closest_user(user_embed, tree, user_embeddings)

    rated_movie_ids, suggested_movie_ids = get_movie_recommendations(closest_user_embed['userId'], num_recomms)

    # dict.fromkeys removes duplicates while keeping ranking order; a set
    # would return the ids in arbitrary hash order.
    candidates = dict.fromkeys(rated_movie_ids + suggested_movie_ids)
    unseen_ids = [movie_id for movie_id in candidates if movie_id not in movie_ratings]

    return [movies['title'][movie_id] for movie_id in unseen_ids]
175
+
176
# Import-time smoke run: movie ids 1-5 rated 1-5.  The returned titles are
# discarded; the call only exercises the full recommendation pipeline.
output_list({1: 1, 2: 2, 3: 3, 4: 4, 5: 5})
177
+
final_movies.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e0ee2e8f16685d0a90fba498be5aa8ff2671f16b10dc93d9f155054ce88322
3
+ size 405680
model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d9a65bbec8f2902af2d4d5ad39f850e337e907fa6a147734590f31f501be600
3
+ size 2508831
movie_embeds.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f2a91da98434bb5c2f7b8a9471b94f932312a03fe823dfdad2bb2c14569e8a5
3
+ size 2773508
movie_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75ed189de06d7bbcaccedf3039b057a5e6a38962091da1971c88423a4cffb7ab
3
+ size 58262
requirements.txt CHANGED
@@ -1,9 +1,3 @@
1
- torch
2
- torch_geometric torch_scatter
3
- torch_sparse
4
- category_encoders
5
- tensorflow_hub
6
- tensorflow
7
- sklearn.model_selection
8
- ast
9
- re
 
1
+ torch_geometric
2
+ torch_scatter
3
+ torch_sparse
 
 
 
 
 
 
user_embeds.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1749f3230af027e900ff131dcbd65c31fafdb6042d2f8ad5b46aed80ba8182c5
3
+ size 232012
user_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aee38b17971f5f012c2104d92c93e6c2da735711480689f30b0667a44892fbc
3
+ size 3531
user_pos_items.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02ecefe1ee185fb6624eadfd3bdde63327c7a806d77e9843fe06499c554105f9
3
+ size 155034