AmirShabani committed
Commit · d19f826
Parent(s): 3adde17

Pretrained Model

Browse files:
- btree.pkl +3 -0
- core.py +177 -0
- final_movies.pkl +3 -0
- model.pth +3 -0
- movie_embeds.pkl +3 -0
- movie_mapping.pkl +3 -0
- requirements.txt +3 -9
- user_embeds.pkl +3 -0
- user_mapping.pkl +3 -0
- user_pos_items.pkl +3 -0
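
In short, the commit checks in a pretrained LightGCN recommender: the serialized weights (model.pth), the pickled lookup tables and BallTree used at inference time (the .pkl files above), the inference script core.py, and a requirements.txt trimmed down to the PyTorch Geometric stack.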
btree.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5177b0ca060d54769061e9a3c83b1439d0436c14baf02d293ca94ff3342fb645
+size 231685
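
Note: the three lines above (and the analogous blocks for the other .pkl/.pth files below) are Git LFS pointer files — a version line, a SHA-256 oid, and a size — standing in for the binary content, which Git LFS fetches (e.g. via `git lfs pull`) when the repository is cloned with LFS enabled.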
core.py
ADDED
@@ -0,0 +1,177 @@
+import pickle
+import numpy as np
+import pandas as pd
+import random
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+import torch
+from torch import nn, optim, Tensor
+from torch_sparse import SparseTensor, matmul
+from torch_geometric.utils import structured_negative_sampling
+from torch_geometric.data import download_url, extract_zip
+from torch_geometric.nn.conv.gcn_conv import gcn_norm
+from torch_geometric.nn.conv import MessagePassing
+from torch_geometric.typing import Adj
+from sklearn.neighbors import BallTree
+
+
+class LightGCN(MessagePassing):
+    def __init__(self, num_users, num_items, embedding_dim=64, diffusion_steps=3, add_self_loops=False):
+        super().__init__()
+
+        # Number of users and items in the graph
+        self.num_users = num_users
+        self.num_items = num_items
+
+        # Embedding dimension for user and item nodes
+        self.embedding_dim = embedding_dim
+
+        # Number of diffusion steps (K) for multi-scale diffusion
+        self.diffusion_steps = diffusion_steps
+
+        # Whether to add self-loops to the adjacency matrix
+        self.add_self_loops = add_self_loops
+
+        # Initialize embeddings for users and items (E^0)
+        self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim)  # e_u^0
+        self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim)  # e_i^0
+
+        # Initialize embedding weights with a normal distribution (mean=0, std=0.1)
+        nn.init.normal_(self.users_emb.weight, std=0.1)
+        nn.init.normal_(self.items_emb.weight, std=0.1)
+
+    def forward(self, edge_index: SparseTensor):
+        # Compute the symmetrically normalized adjacency matrix (A_hat)
+        edge_index_norm = gcn_norm(edge_index, add_self_loops=self.add_self_loops)
+
+        # Get the initial embeddings E^0 for all nodes (users and items)
+        emb_0 = torch.cat([self.users_emb.weight, self.items_emb.weight])  # E^0
+
+        # Store the embeddings at each diffusion step (E^0, E^1, ..., E^K)
+        embs = [emb_0]
+        emb_k = emb_0
+
+        # Perform multi-scale diffusion for K steps
+        for _ in range(self.diffusion_steps):
+            # Propagate embeddings through the normalized adjacency matrix
+            emb_k = self.propagate(edge_index_norm, x=emb_k)
+            embs.append(emb_k)
+
+        # Stack the per-step embeddings and average them to get the final embeddings
+        embs = torch.stack(embs, dim=1)
+        emb_final = torch.mean(embs, dim=1)
+
+        # Split the final embeddings into user embeddings (e_u^K) and item embeddings (e_i^K)
+        users_emb_final, items_emb_final = torch.split(emb_final, [self.num_users, self.num_items])
+
+        # Return the final and initial embeddings for users and items
+        return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight
+
+    def message(self, x_j: Tensor) -> Tensor:
+        # The message function is the identity: each neighbor contributes its embedding unchanged
+        return x_j
+
+    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
+        # Message passing and aggregation via sparse multiplication with the normalized adjacency matrix
+        return matmul(adj_t, x)
+
+
+model = LightGCN(671, 9125)
+
+
+def get_movie_recommendations(user_id, num_recomms):
+    # Map the user ID to the corresponding index in the model's user embeddings
+    user_index = user_mapping[user_id]
+
+    # Retrieve the embedding for the specified user
+    user_embedding = model.users_emb.weight[user_index]
+
+    # Score all items against the user embedding
+    scores = model.items_emb.weight @ user_embedding
+
+    # Take the top scores: the user's known positive items plus num_recomms extra candidates
+    values, indices = torch.topk(scores, k=len(user_pos_items[user_id]) + num_recomms)
+
+    # Recommended movies the user has already rated highly
+    rated_movies = [index.cpu().item() for index in indices if index in user_pos_items[user_id]][:num_recomms]
+    rated_movie_ids = [list(movie_mapping.keys())[list(movie_mapping.values()).index(movie)] for movie in rated_movies]
+
+    # Suggested movies the user has not rated yet
+    suggested_movies = [index.cpu().item() for index in indices if index not in user_pos_items[user_id]][:num_recomms]
+    suggested_movie_ids = [list(movie_mapping.keys())[list(movie_mapping.values()).index(movie)] for movie in suggested_movies]
+
+    return rated_movie_ids, suggested_movie_ids
+
+
+addr = './'
+
+# Load the pretrained model weights and the pickled lookup tables
+model.load_state_dict(torch.load(addr + 'model.pth'))
+
+with open(addr + 'final_movies.pkl', 'rb') as f:
+    final_movies = pickle.load(f)
+
+with open(addr + 'movie_embeds.pkl', 'rb') as f:
+    movie_embeds = pickle.load(f)
+
+with open(addr + 'btree.pkl', 'rb') as f:
+    btree = pickle.load(f)
+
+with open(addr + 'user_embeds.pkl', 'rb') as f:
+    user_embeds = pickle.load(f)
+
+with open(addr + 'user_mapping.pkl', 'rb') as f:
+    user_mapping = pickle.load(f)
+
+with open(addr + 'movie_mapping.pkl', 'rb') as f:
+    movie_mapping = pickle.load(f)
+
+with open(addr + 'user_pos_items.pkl', 'rb') as f:
+    user_pos_items = pickle.load(f)
+
+
+def create_user_embedding(movie_ratings, movies_df):
+    # Convert the movie_ratings dictionary to a dataframe
+    user_ratings_df = pd.DataFrame.from_dict(movie_ratings, orient='index', columns=['rating'])
+    user_ratings_df['movieId'] = user_ratings_df.index
+
+    # Merge the user ratings with movies_df to get the movie embeddings
+    user_movie_embeddings = user_ratings_df.merge(movies_df, on='movieId', how='left')
+
+    # Weight each movie embedding by its rating
+    user_movie_embeddings = user_movie_embeddings.iloc[:, 2:].values * user_movie_embeddings['rating'].values[:, np.newaxis]
+
+    # The user embedding is the sum of the weighted movie embeddings,
+    # with NaNs (from movies missing in movies_df) replaced by zeros
+    user_embedding = np.sum(user_movie_embeddings, axis=0)
+    user_embedding = np.nan_to_num(user_embedding, nan=0.0)
+    print(user_movie_embeddings.shape)
+    return user_embedding
+
+
+def find_closest_user(user_embedding, tree, user_embeddings):
+    # Query the BallTree to find the closest known user to the given embedding
+    _, closest_user_index = tree.query([user_embedding], k=1)
+
+    # Get the row (embedding and userId) of the closest user
+    closest_user_embedding = user_embeddings.iloc[closest_user_index[0][0]]
+
+    return closest_user_embedding
+
+
+def output_list(movie_ratings, movies_df=movie_embeds, tree=btree, user_embeddings=user_embeds, movies=final_movies):
+    # Build an embedding for the new user from their ratings
+    user_embed = create_user_embedding(movie_ratings, movies_df)
+    # Find the closest known user with the pre-built BallTree
+    closest_user_embed = find_closest_user(user_embed, tree, user_embeddings)
+    # Recommend movies for that closest user
+    rated_movie_ids, suggested_movie_ids = get_movie_recommendations(closest_user_embed['userId'], 5)
+    # Drop movies the new user has already rated and map the remaining IDs to titles
+    out1 = [movie_id for movie_id in set(rated_movie_ids + suggested_movie_ids) if movie_id not in movie_ratings.keys()]
+    out2 = [movies['title'][idx] for idx in out1]
+    return out2
+
+
+output_list({1: 1, 2: 2, 3: 3, 4: 4, 5: 5})
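
For reference, the multi-scale diffusion implemented in LightGCN.forward above is the standard LightGCN propagation rule (a LaTeX sketch matching the comments in the code; here K = diffusion_steps = 3):

    e_u^{(k+1)} = \sum_{i \in N(u)} \frac{1}{\sqrt{|N(u)|}\sqrt{|N(i)|}} \, e_i^{(k)}, \qquad
    e_i^{(k+1)} = \sum_{u \in N(i)} \frac{1}{\sqrt{|N(i)|}\sqrt{|N(u)|}} \, e_u^{(k)}

    e_u = \frac{1}{K+1} \sum_{k=0}^{K} e_u^{(k)}, \qquad
    e_i = \frac{1}{K+1} \sum_{k=0}^{K} e_i^{(k)}, \qquad
    \hat{y}_{ui} = e_u^{\top} e_i

Two usage notes on the code: get_movie_recommendations scores with a plain dot product against the stored embedding tables (users_emb.weight, items_emb.weight) rather than re-running forward, and output_list expects a dict mapping movieId to rating; the call on the last line appears to be a smoke test with placeholder values.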
final_movies.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86e0ee2e8f16685d0a90fba498be5aa8ff2671f16b10dc93d9f155054ce88322
+size 405680
model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d9a65bbec8f2902af2d4d5ad39f850e337e907fa6a147734590f31f501be600
+size 2508831
movie_embeds.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f2a91da98434bb5c2f7b8a9471b94f932312a03fe823dfdad2bb2c14569e8a5
+size 2773508
movie_mapping.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75ed189de06d7bbcaccedf3039b057a5e6a38962091da1971c88423a4cffb7ab
+size 58262
requirements.txt
CHANGED
@@ -1,9 +1,3 @@
-
-
-torch_sparse
-category_encoders
-tensorflow_hub
-tensorflow
-sklearn.model_selection
-ast
-re
+torch_geometric
+torch_scatter
+torch_sparse
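
A note on these requirements: torch_scatter and torch_sparse generally need wheels built against the exact torch/CUDA version in the environment (the PyG wheel index is the usual source), and core.py additionally imports torch, numpy, pandas, scikit-learn, tqdm, and matplotlib, which are not listed here and are presumably expected to be preinstalled in the hosting environment.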
user_embeds.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1749f3230af027e900ff131dcbd65c31fafdb6042d2f8ad5b46aed80ba8182c5
+size 232012
user_mapping.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9aee38b17971f5f012c2104d92c93e6c2da735711480689f30b0667a44892fbc
+size 3531
user_pos_items.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02ecefe1ee185fb6624eadfd3bdde63327c7a806d77e9843fe06499c554105f9
+size 155034