|
import pip |
|
|
|
def install(package):
    """Install *package* into the current interpreter's environment.

    The original implementation called ``pip.main`` /
    ``pip._internal.main`` in-process, which pip's own documentation
    says is unsupported and can leave pip in an inconsistent state.
    The recommended approach is to run pip as a subprocess of the
    exact interpreter we are running under.

    Raises:
        subprocess.CalledProcessError: if the pip invocation fails,
            instead of silently returning as the old code did.
    """
    import subprocess
    import sys

    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
|
|
|
# Install the PyG stack at import time — needed on hosts (e.g. a fresh
# inference container) where these wheels are not preinstalled.
print("Everything goes bang.")

install('torch_geometric')

install('torch_scatter')

install('torch_sparse')

print("It's havoc baby!")
|
|
|
import pickle |
|
import numpy as np |
|
import pandas as pd |
|
import random |
|
from tqdm import tqdm |
|
import matplotlib.pyplot as plt |
|
from sklearn.model_selection import train_test_split |
|
import torch |
|
from torch import nn, optim, Tensor |
|
from torch_sparse import SparseTensor, matmul |
|
from torch_geometric.utils import structured_negative_sampling |
|
from torch_geometric.data import download_url, extract_zip |
|
from torch_geometric.nn.conv.gcn_conv import gcn_norm |
|
from torch_geometric.nn.conv import MessagePassing |
|
from torch_geometric.typing import Adj |
|
from sklearn.neighbors import BallTree |
|
from thefuzz import fuzz |
|
from thefuzz import process |
|
|
|
class LightGCN(MessagePassing):
    """LightGCN collaborative-filtering model.

    Learns one embedding table per node type (users, items) and refines
    them by repeatedly propagating over the normalized user-item graph,
    then averaging the embeddings produced at every diffusion step.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, diffusion_steps=3, add_self_loops=False):
        super().__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.diffusion_steps = diffusion_steps
        self.add_self_loops = add_self_loops

        # Free-parameter embedding tables; users and items share one space.
        self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim)
        self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim)

        for table in (self.users_emb, self.items_emb):
            nn.init.normal_(table.weight, std=0.1)

    def forward(self, edge_index: SparseTensor):
        """Diffuse embeddings over the graph.

        Returns a 4-tuple: (final user embeddings, initial user
        embeddings, final item embeddings, initial item embeddings).
        """
        # Symmetric GCN normalization of the adjacency.
        norm_adj = gcn_norm(edge_index, add_self_loops=self.add_self_loops)

        # Layer 0: raw embedding tables stacked users-first.
        layer_emb = torch.cat([self.users_emb.weight, self.items_emb.weight])
        per_layer = [layer_emb]

        for _ in range(self.diffusion_steps):
            layer_emb = self.propagate(norm_adj, x=layer_emb)
            per_layer.append(layer_emb)

        # Mean over all diffusion depths (LightGCN's layer combination).
        emb_final = torch.mean(torch.stack(per_layer, dim=1), dim=1)

        users_emb_final, items_emb_final = torch.split(emb_final, [self.num_users, self.num_items])

        return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight

    def message(self, x_j: Tensor) -> Tensor:
        # Neighbors are passed through unchanged; normalization lives in
        # the adjacency itself.
        return x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        # Fused message+aggregate: one sparse-dense matmul.
        return matmul(adj_t, x)
|
|
|
|
|
# Dimensions of the dataset the checkpoint below was trained on
# (671 users, 9125 movies — must match 'model.pth' or load fails).
model = LightGCN(671, 9125)
|
|
|
def get_movie_recommendations(user_id, num_recomms):
    """Score all items against one user and split top hits into seen/unseen.

    Relies on module-level state: ``model``, ``user_mapping``,
    ``movie_mapping`` and ``user_pos_items``.

    Args:
        user_id: external user id (a key of ``user_mapping``).
        num_recomms: how many ids to return in each list.

    Returns:
        (rated_movie_ids, suggested_movie_ids) — external movie ids the
        user has already interacted with vs. novel recommendations.
    """
    user_index = user_mapping[user_id]

    user_embedding = model.users_emb.weight[user_index]

    # Dot-product relevance of every item to this user.
    scores = model.items_emb.weight @ user_embedding

    # Over-fetch so that after removing already-rated items we still have
    # num_recomms novel candidates left.
    _, indices = torch.topk(scores, k=len(user_pos_items[user_id]) + num_recomms)

    # Invert movie_mapping once (O(n)) instead of calling
    # list(...).index(...) per movie (the original was O(n) per lookup,
    # O(n^2) overall). Assumes mapping values are unique, which the old
    # .index()-based lookup already required.
    index_to_movie_id = {index: movie_id for movie_id, index in movie_mapping.items()}

    rated_movies = [index.cpu().item() for index in indices if index in user_pos_items[user_id]][:num_recomms]
    rated_movie_ids = [index_to_movie_id[movie] for movie in rated_movies]

    suggested_movies = [index.cpu().item() for index in indices if index not in user_pos_items[user_id]][:num_recomms]
    suggested_movie_ids = [index_to_movie_id[movie] for movie in suggested_movies]

    return rated_movie_ids, suggested_movie_ids
|
|
|
addr = './'

# Restore trained LightGCN weights (counts above must match this file).
model.load_state_dict(torch.load(addr + 'model.pth'))


def _load_pickle(name):
    """Unpickle one artifact from *addr*, closing the file even on error.

    SECURITY NOTE: pickle.load executes arbitrary code from the file —
    only ever point *addr* at trusted, locally produced artifacts.
    """
    with open(addr + name, "rb") as f:
        return pickle.load(f)


# The original opened/closed each file by hand with no try/finally, which
# leaks the handle if pickle.load raises; the context manager fixes that.
final_movies = _load_pickle('final_movies.pkl')
movie_embeds = _load_pickle('movie_embeds.pkl')
btree = _load_pickle('btree.pkl')
user_embeds = _load_pickle('user_embeds.pkl')
user_mapping = _load_pickle('user_mapping.pkl')
movie_mapping = _load_pickle('movie_mapping.pkl')
user_pos_items = _load_pickle('user_pos_items.pkl')
|
|
|
def create_user_embedding(movie_ratings, movies_df):
    """Build a taste vector for a new user from their ratings.

    Each rated movie's embedding row in *movies_df* is scaled by the
    user's rating; the scaled rows are summed into one vector.

    Args:
        movie_ratings: mapping of movieId -> rating given by the user.
        movies_df: DataFrame with a 'movieId' column followed by the
            embedding columns (assumed layout — after the merge, columns
            0 and 1 are 'rating' and 'movieId', the rest the embedding).

    Returns:
        1-D numpy array: rating-weighted sum of movie embeddings, with
        NaNs (movies missing from movies_df) replaced by 0.
    """
    user_ratings_df = pd.DataFrame.from_dict(movie_ratings, orient='index', columns=['rating'])
    user_ratings_df['movieId'] = user_ratings_df.index

    # BUG FIX: the original printed user_movie_embeddings here, before it
    # was assigned, raising UnboundLocalError on every call. The debug
    # prints are removed.

    # Left-join keeps every rated movie; unknown ids yield NaN embeddings.
    user_movie_embeddings = user_ratings_df.merge(movies_df, on='movieId', how='left')

    # Skip the 'rating' and 'movieId' columns; weight each row by rating.
    user_movie_embeddings = user_movie_embeddings.iloc[:, 2:].values * user_movie_embeddings['rating'].values[:, np.newaxis]

    user_embedding = np.sum(user_movie_embeddings, axis=0)
    # The original called np.nan_to_num(user_embedding, 0), which passes 0
    # to the *copy* parameter; be explicit and keep the returned array.
    user_embedding = np.nan_to_num(user_embedding, nan=0.0)
    return user_embedding
|
|
|
def find_closest_user(user_embedding, tree, user_embeddings):
    """Return the stored user row nearest to *user_embedding*.

    *tree* is a fitted nearest-neighbor index whose query() takes a 2-D
    array; *user_embeddings* is a DataFrame positionally aligned with it.
    """
    # query() wants a batch, so wrap the single embedding in a list.
    _distances, neighbor_idx = tree.query([user_embedding], k=1)

    nearest_position = neighbor_idx[0][0]
    return user_embeddings.iloc[nearest_position]
|
|
|
|
|
def drop_non_numerical_columns(df):
    """Return a copy of *df* with every non-float/int column removed."""
    to_drop = list(df.select_dtypes(exclude=[float, int]).columns)
    return df.drop(columns=to_drop, inplace=False)
|
|
|
def output_list(input_dict, movies_df = movie_embeds, tree = btree, user_embeddings = user_embeds, movies = final_movies):
    """Recommend movie titles for a brand-new user.

    Args:
        input_dict: mapping of free-text movie title -> user's rating.
        movies_df: movie-embedding DataFrame (defaults to the module's
            loaded artifact).
        tree: nearest-neighbor index over known users.
        user_embeddings: DataFrame of known-user embeddings, positionally
            aligned with *tree*; must contain a 'userId' column.
        movies: DataFrame with a 'title' column indexed by movie index.

    Returns:
        List of recommended titles the user has not rated yet.
    """
    # Resolve each free-text title to a known movie index by fuzzy match.
    movie_ratings = {}
    for movie_title, rating in input_dict.items():
        matching_title = process.extractOne(movie_title, movies['title'].values, scorer=fuzz.partial_token_sort_ratio)[0]
        index = movies.index[movies['title'] == matching_title].tolist()[0]
        movie_ratings[index] = rating

    # BUG FIX: the original ignored the movies_df / user_embeddings /
    # movies parameters and read the module globals directly, so passing
    # custom arguments silently did nothing. Use the parameters.
    user_embed = create_user_embedding(movie_ratings, movies_df)

    closest_user_embed = find_closest_user(user_embed, tree, user_embeddings)
    rated_movie_ids, suggested_movie_ids = get_movie_recommendations(closest_user_embed['userId'], 5)

    # Drop anything the user already rated, then map ids back to titles.
    candidate_ids = [movie_id for movie_id in set(rated_movie_ids + suggested_movie_ids) if movie_id not in movie_ratings]
    return [movies['title'][idx] for idx in candidate_ids]
|
|
|
|
|
|
|
|