File size: 8,801 Bytes
366ed46 eb556bd 366ed46 415bdb0 366ed46 415bdb0 eb556bd d19f826 a3e15e0 d19f826 415bdb0 db0d1d3 d19f826 415bdb0 325a5ae 42428e3 a3e15e0 225ee02 d19f826 5261ce7 d19f826 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import pip
def install(package):
    """Install ``package`` into the running interpreter's environment via pip.

    pip is run as ``<python> -m pip install <package>`` in a child process.
    Calling ``pip.main`` / ``pip._internal.main`` in-process is not a supported
    pip API, and a bare ``import pip`` does not guarantee that the
    ``pip._internal`` submodule is loaded, so the fallback branch could fail
    with ``AttributeError``.

    Args:
        package: Requirement specifier passed verbatim to ``pip install``.

    Returns:
        The pip process exit status (0 on success). A failed install does not
        raise, so installation stays best-effort.
    """
    # Imported locally so this function does not rely on file-level imports.
    import subprocess
    import sys

    # sys.executable ensures the package is installed for *this* interpreter,
    # not whichever ``pip`` happens to be first on PATH.
    return subprocess.call([sys.executable, "-m", "pip", "install", package])
# Install the graph-learning dependencies at start-up, before the imports
# further down that need them. Order is preserved from the original script.
print("Everything goes bang.")
for _dependency in ('torch_geometric', 'torch_scatter', 'torch_sparse'):
    install(_dependency)
print("It's havoc baby!")
import pickle
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim, Tensor
from torch_sparse import SparseTensor, matmul
from torch_geometric.utils import structured_negative_sampling
from torch_geometric.data import download_url, extract_zip
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj
from sklearn.neighbors import BallTree
from thefuzz import fuzz
from thefuzz import process
class LightGCN(MessagePassing):
    """Embedding propagation over a user-item graph.

    Users and items each own a learnable embedding table. ``forward`` stacks
    the two tables, repeatedly propagates them through the normalized
    adjacency, and averages the embeddings of every step (step 0 included).
    """

    def __init__(self, num_users, num_items, embedding_dim=64, diffusion_steps=3, add_self_loops=False):
        """Create the user and item embedding tables.

        Args:
            num_users: Number of user nodes.
            num_items: Number of item nodes.
            embedding_dim: Width of every embedding vector.
            diffusion_steps: Number of propagation steps (K) run by ``forward``.
            add_self_loops: Forwarded to ``gcn_norm`` when the adjacency is
                normalized.
        """
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.embedding_dim = embedding_dim
        self.diffusion_steps = diffusion_steps
        self.add_self_loops = add_self_loops

        # Step-0 embeddings (E^0): one row per user / per item.
        self.users_emb = nn.Embedding(num_users, embedding_dim)
        self.items_emb = nn.Embedding(num_items, embedding_dim)

        # Re-draw both tables from a normal distribution with std 0.1
        # (users first, then items).
        for table in (self.users_emb, self.items_emb):
            nn.init.normal_(table.weight, std=0.1)

    def forward(self, edge_index: SparseTensor):
        """Propagate embeddings and return final and initial tables.

        Args:
            edge_index: Adjacency passed to ``gcn_norm`` and then ``propagate``.

        Returns:
            A 4-tuple ``(users_final, users_initial, items_final,
            items_initial)``, where the "final" tensors are the mean over the
            step-0 embeddings and every propagation step, and the "initial"
            tensors are the raw embedding weights.
        """
        adj_norm = gcn_norm(edge_index, add_self_loops=self.add_self_loops)

        # Users come first, items second; the split below relies on this order.
        per_step = [torch.cat([self.users_emb.weight, self.items_emb.weight])]
        for _step in range(self.diffusion_steps):
            # Each step propagates the output of the previous one.
            per_step.append(self.propagate(adj_norm, x=per_step[-1]))

        # Shape after stack: (nodes, steps + 1, dim); average over the step axis.
        emb_final = torch.stack(per_step, dim=1).mean(dim=1)

        users_emb_final, items_emb_final = torch.split(
            emb_final, [self.num_users, self.num_items]
        )
        return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight

    def message(self, x_j: Tensor) -> Tensor:
        """Return the neighbour features unchanged (identity message)."""
        return x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Fuse message and aggregation into one ``matmul(adj_t, x)``."""
        return matmul(adj_t, x)
# Build the network with num_users=671 and num_items=9125 (the first two
# positional parameters of LightGCN.__init__); the other hyper-parameters keep
# their defaults. Presumably these sizes match the checkpoint loaded later in
# this file -- verify against model.pth.
model = LightGCN(671, 9125)
def get_movie_recommendations(user_id, num_recomms):
    """Rank movies for a known user by embedding dot-product.

    Reads the module-level ``model``, ``user_mapping``, ``movie_mapping`` and
    ``user_pos_items``.

    Args:
        user_id: Key into ``user_mapping`` and ``user_pos_items``.
        num_recomms: Maximum length of each returned list.

    Returns:
        A tuple ``(rated_movie_ids, suggested_movie_ids)``. Both lists hold
        keys of ``movie_mapping`` in descending score order: the first
        contains top-scoring movies found in ``user_pos_items[user_id]``, the
        second top-scoring movies that are not.

    Raises:
        KeyError: If ``user_id`` is missing from ``user_mapping`` or
            ``user_pos_items``, or a ranked index has no ``movie_mapping`` key.
    """
    user_index = user_mapping[user_id]

    # NOTE(review): positives are looked up by the raw ``user_id`` while the
    # embedding row uses the mapped ``user_index`` -- confirm that
    # user_pos_items is keyed by raw id.
    pos_items = user_pos_items[user_id]
    # int() accepts plain ints as well as 0-dim tensors.
    positives = {int(item) for item in pos_items}

    # NOTE(review): scores use the raw embedding tables (``.weight``), not the
    # propagated outputs of ``model.forward`` -- confirm this is intended.
    user_embedding = model.users_emb.weight[user_index]
    scores = model.items_emb.weight @ user_embedding

    # Ask for enough candidates that, even if every positive item ranks first,
    # ``num_recomms`` unseen items remain. torch.topk raises when k exceeds
    # the number of scores, so clamp it.
    k = min(len(pos_items) + num_recomms, scores.numel())
    _, top_indices = torch.topk(scores, k=k)
    ranked = top_indices.tolist()  # plain ints: cheap set membership below

    # Invert movie_mapping once instead of rebuilding key/value lists for each
    # movie. setdefault keeps the first key for a duplicated index, matching
    # the previous ``list.index`` behaviour.
    index_to_movie_id = {}
    for movie_id, item_index in movie_mapping.items():
        index_to_movie_id.setdefault(item_index, movie_id)

    rated_movies = [i for i in ranked if i in positives][:num_recomms]
    suggested_movies = [i for i in ranked if i not in positives][:num_recomms]

    rated_movie_ids = [index_to_movie_id[i] for i in rated_movies]
    suggested_movie_ids = [index_to_movie_id[i] for i in suggested_movies]
    return rated_movie_ids, suggested_movie_ids
# Directory that holds the trained weights and the pickled lookup artifacts.
addr = './'


def _load_pickle(filename):
    """Unpickle ``addr + filename``, closing the file even if loading fails."""
    # NOTE: unpickling can execute arbitrary code; these files must only ever
    # come from a trusted source (they are expected to ship with the app).
    with open(addr + filename, "rb") as handle:
        return pickle.load(handle)


# ``model`` is built on the CPU, so map the checkpoint there as well; without
# map_location a checkpoint saved from a GPU fails to load on a CPU-only host.
model.load_state_dict(torch.load(addr + 'model.pth', map_location='cpu'))

final_movies = _load_pickle('final_movies.pkl')
movie_embeds = _load_pickle('movie_embeds.pkl')
btree = _load_pickle('btree.pkl')
user_embeds = _load_pickle('user_embeds.pkl')
user_mapping = _load_pickle('user_mapping.pkl')
movie_mapping = _load_pickle('movie_mapping.pkl')
user_pos_items = _load_pickle('user_pos_items.pkl')
def create_user_embedding(movie_ratings, movies_df):
    """Build a user vector as the rating-weighted sum of movie embeddings.

    Args:
        movie_ratings: Mapping of movie id -> rating.
        movies_df: DataFrame with a ``movieId`` column; every other column is
            treated as one embedding dimension.

    Returns:
        1-D numpy array with one entry per non-``movieId`` column of
        ``movies_df``. Rated movies that are absent from ``movies_df``
        contribute nothing to the sum.
    """
    # One row per rated movie, with the dict key exposed as 'movieId'.
    user_ratings_df = pd.DataFrame.from_dict(movie_ratings, orient='index', columns=['rating'])
    user_ratings_df['movieId'] = user_ratings_df.index

    # Left merge keeps every rated movie; ids missing from movies_df get NaN
    # in all embedding columns.
    user_movie_embeddings = user_ratings_df.merge(movies_df, on='movieId', how='left')

    # Diagnostic output kept from the original implementation.
    print(user_ratings_df)
    print(user_movie_embeddings)

    # After the merge, columns 0-1 are 'rating' and 'movieId'; everything from
    # column 2 on comes from movies_df. Scale each movie's row by its rating.
    user_movie_embeddings = (
        user_movie_embeddings.iloc[:, 2:].values
        * user_movie_embeddings['rating'].values[:, np.newaxis]
    )

    # nansum skips the NaN rows of unmatched movies. A plain sum would turn
    # every column into NaN, and the clean-up below would then zero the whole
    # embedding because of a single unknown movie.
    user_embedding = np.nansum(user_movie_embeddings, axis=0)

    # Replace any remaining NaN/inf with finite values. The result is assigned
    # explicitly rather than relying on a positional ``copy`` flag.
    user_embedding = np.nan_to_num(user_embedding)

    print(user_movie_embeddings.shape)
    return user_embedding
def find_closest_user(user_embedding, tree, user_embeddings):
    """Return the entry of ``user_embeddings`` nearest to ``user_embedding``.

    Args:
        user_embedding: Query vector; it is wrapped in a list to form a
            one-row batch for ``tree.query``.
        tree: Object whose ``query(X, k=1)`` returns ``(distances, indices)``.
        user_embeddings: Object supporting positional ``.iloc[...]`` lookup;
            presumably its row order matches the points ``tree`` was built
            from -- verify against the code that created both.

    Returns:
        ``user_embeddings.iloc[i]`` where ``i`` is the nearest-neighbour
        position reported by ``tree``.
    """
    query_batch = [user_embedding]
    _distances, positions = tree.query(query_batch, k=1)
    # One query row and k=1, so the only hit sits at [0][0].
    nearest_position = positions[0][0]
    return user_embeddings.iloc[nearest_position]
def drop_non_numerical_columns(df):
    """Return a copy of ``df`` without the columns rejected by the dtype filter.

    A column is removed when ``select_dtypes(exclude=[float, int])`` selects
    it, i.e. when pandas does not classify it under the builtin ``float`` or
    ``int`` selectors. ``df`` itself is left unmodified.

    NOTE(review): which concrete numpy widths the builtin ``float``/``int``
    selectors cover is decided by pandas; other numeric dtypes may be dropped
    too -- confirm if such columns are expected.
    """
    rejected = df.select_dtypes(exclude=[float, int]).columns
    # drop() without inplace returns a new frame.
    return df.drop(labels=rejected, axis="columns")
def output_list(input_dict, movies_df = movie_embeds, tree = btree, user_embeddings = user_embeds, movies = final_movies):
    """Recommend movie titles from a handful of user-supplied ratings.

    Each supplied title is fuzzy-matched against ``movies['title']``, a user
    embedding is built from the matched movies, the nearest known user is
    looked up, and that user's recommendations are returned.

    Args:
        input_dict: Mapping of free-text movie title -> rating.
        movies_df: Movie embedding table passed to ``create_user_embedding``.
        tree: Nearest-neighbour index passed to ``find_closest_user``.
        user_embeddings: Table passed to ``find_closest_user``; the selected
            entry must expose a ``'userId'`` field.
        movies: DataFrame with a ``'title'`` column. Its index labels are used
            as movie ids (assumed to agree with ``movieId`` in ``movies_df``
            and with the ids returned by ``get_movie_recommendations`` --
            verify).

    Returns:
        List of recommended titles, rated-and-liked movies first, then new
        suggestions, each in ranking order. Movies the caller rated are
        excluded.
    """
    # Use the ``movies`` argument (not the module global) and build the title
    # array once, not once per input title.
    titles = movies['title'].values

    movie_ratings = {}
    for movie_title, rating in input_dict.items():
        # extractOne returns (best_match, score, ...); only the match is needed.
        matching_title = process.extractOne(movie_title, titles, scorer=fuzz.partial_token_sort_ratio)[0]
        # First index label whose title equals the fuzzy match.
        index = movies.index[movies['title'] == matching_title].tolist()[0]
        movie_ratings[index] = rating

    # Honour the caller-supplied tables; previously the module-level
    # ``movie_embeds`` / ``user_embeds`` were used regardless of the arguments.
    user_embed = create_user_embedding(movie_ratings, movies_df)
    closest_user_embed = find_closest_user(user_embed, tree, user_embeddings)

    rated_movie_ids, suggested_movie_ids = get_movie_recommendations(closest_user_embed['userId'], 5)

    # dict.fromkeys de-duplicates while keeping ranking order (a set would
    # return the ids in arbitrary order).
    candidate_ids = dict.fromkeys(rated_movie_ids + suggested_movie_ids)
    recommended_ids = [movie_id for movie_id in candidate_ids if movie_id not in movie_ratings]
    return [movies['title'][movie_id] for movie_id in recommended_ids]
# output_list({1:1,2:2,3:3,4:4,5:5})
|