In [2]:
import sys

sys.path.insert(0, "./interactive_tutorials")

import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import itertools
import requests
import sys

import torch
import torch.nn.functional as F
from torch.nn import Linear
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData
import yaml

print(torch.__version__)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

2.2.1+cu121


In [3]:
data = torch.load("./PyGdata.pt")
data.edge_index_dict




{('user',
 'rates',
 'movie'): tensor([[ 0, 0, 0, ..., 670, 670, 670],
 [ 0, 1, 2, ..., 1327, 1329, 2941]], device='cuda:0'),
 ('movie',
 'rev_rates',
 'user'): tensor([[ 0, 1, 2, ..., 1327, 1329, 2941],
 [ 0, 0, 0, ..., 670, 670, 670]], device='cuda:0')}

{('user',
 'rates',
 'movie'): tensor([[ 0, 0, 0, ..., 670, 670, 670],
 [ 0, 1, 2, ..., 1327, 1329, 2941]], device='cuda:0'),
 ('movie',
 'rev_rates',
 'user'): tensor([[ 0, 1, 2, ..., 1327, 1329, 2941],
 [ 0, 0, 0, ..., 670, 670, 670]], device='cuda:0')}

In [4]:
class GNNEncoder(torch.nn.Module):
 def __init__(self, hidden_channels, out_channels):
 super().__init__()
 # these convolutions have been replicated to match the number of edge types
 self.conv1 = SAGEConv((-1, -1), hidden_channels)
 self.conv2 = SAGEConv((-1, -1), out_channels)

 def forward(self, x, edge_index):
 x = self.conv1(x, edge_index).relu()
 x = self.conv2(x, edge_index)
 return x


In [5]:
class EdgeDecoder(torch.nn.Module):
 def __init__(self, hidden_channels):
 super().__init__()
 self.lin1 = Linear(2 * hidden_channels, hidden_channels)
 self.lin2 = Linear(hidden_channels, 1)

 def forward(self, z_dict, edge_label_index):
 row, col = edge_label_index
 # concat user and movie embeddings
 z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)
 # concatenated embeddings passed to linear layer
 z = self.lin1(z).relu()
 z = self.lin2(z)
 return z.view(-1)

In [6]:
class Model(torch.nn.Module):
 def __init__(self, hidden_channels):
 super().__init__()
 self.encoder = GNNEncoder(hidden_channels, hidden_channels)
 self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
 self.decoder = EdgeDecoder(hidden_channels)

 def forward(self, x_dict, edge_index_dict, edge_label_index):
 # z_dict contains dictionary of movie and user embeddings returned from GraphSage
 z_dict = self.encoder(x_dict, edge_index_dict)
 return self.decoder(z_dict, edge_label_index)

In [10]:
model = Model(hidden_channels=32).to(device)
model2 = Model(hidden_channels=32).to(device)
model.load_state_dict(torch.load("PyGTrainedModelState.pt"))
model.eval()

total_users = data['user'].num_nodes 
total_movies = data['movie'].num_nodes 
print(data)


HeteroData(
 user={ x=[671, 671] },
 movie={ x=[9025, 404] },
 (user, rates, movie)={
 edge_index=[2, 99810],
 edge_label=[99810],
 },
 (movie, rev_rates, user)={ edge_index=[2, 99810] }
)


In [9]:
movie_recs = []
for user_id in tqdm(range(0, total_users)):
 user_row = torch.tensor([user_id] * total_movies)
 all_movie_ids = torch.arange(total_movies)
 edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)
 pred = model(data.x_dict, data.edge_index_dict,
 edge_label_index)
 pred = pred.clamp(min=0, max=5)
 # we will only select movies for the user where the predicting rating is =5
 rec_movie_ids = (pred == 5).nonzero(as_tuple=True)
 top_ten_recs = [rec_movies for rec_movies in rec_movie_ids[0].tolist()[:10]]
 movie_recs.append({'user': user_id, 'rec_movies': top_ten_recs})

100%|██████████| 671/671 [00:05<00:00, 121.64it/s]


In [15]:
metadata_path = './sampled_movie_dataset/movies_metadata.csv'
df = pd.read_csv(metadata_path)
df.columns

def get_movie_title(movie_id):
 """Looks up a movie title by its ID in the DataFrame."""

 row = df[df['id'] == movie_id]

 if not row.empty:
 return row['title'].iloc[0] # Get the title from the first matching row
 else:
 return "Movie not found"
 
print(get_movie_title(14))

Movie not found


 df = pd.read_csv(metadata_path)


In [26]:

movie_recs_df = pd.DataFrame(movie_recs)
#movie_recs_df = movie_recs_df.set_index('id').join(df[['title']].set_index('id'), how='left')
print(movie_recs_df.head()) 
print(movie_recs_df.columns) 

 user rec_movies
0 0 [14, 85, 101, 106, 111, 131, 132, 150, 210, 216]
1 1 [13, 45, 95, 108, 109, 126, 130, 132, 213, 220]
2 2 [562, 571, 894, 1013, 1169, 1289, 1378, 1405, ...
3 3 [126, 137, 502, 571, 616, 696, 811, 966, 999, ...
4 4 [364, 436, 493, 502, 509, 706, 781, 811, 1244,...
Index(['user', 'rec_movies'], dtype='object')
