Spaces:

sohvren
/

MovieRecommenderV2

Sleeping

App Files Files Community

AJ-Gazin committed on Apr 8, 2024

Commit

06d9388

1 Parent(s): 960b542

Added more of main program

Browse files

Files changed (10) hide show

.gitignore +3 -0
PredictionGenerator.ipynb +315 -0
PyGTrainedModelState.pt +3 -0
PyGdata.pt +3 -0
model_def.py +43 -0
movie_embeddings.pt +3 -0
movie_embeddings_concat.pt +3 -0
requirements.txt +0 -0
visualizer.py +98 -0
viz_utils.py +96 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# .gitignore
+.env
+creds.dat

PredictionGenerator.ipynb ADDED Viewed

	@@ -0,0 +1,315 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2.2.1+cu121\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "\n",
+    "sys.path.insert(0, \"./interactive_tutorials\")\n",
+    "\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import itertools\n",
+    "import requests\n",
+    "import sys\n",
+    "\n",
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "from torch.nn import Linear\n",
+    "import torch_geometric.transforms as T\n",
+    "from torch_geometric.nn import SAGEConv, to_hetero\n",
+    "from torch_geometric.transforms import RandomLinkSplit, ToUndirected\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from torch_geometric.data import HeteroData\n",
+    "import yaml\n",
+    "\n",
+    "print(torch.__version__)\n",
+    "\n",
+    "\n",
+    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('user',\n",
+       "  'rates',\n",
+       "  'movie'): tensor([[   0,    0,    0,  ...,  670,  670,  670],\n",
+       "         [   0,    1,    2,  ..., 1327, 1329, 2941]], device='cuda:0'),\n",
+       " ('movie',\n",
+       "  'rev_rates',\n",
+       "  'user'): tensor([[   0,    1,    2,  ..., 1327, 1329, 2941],\n",
+       "         [   0,    0,    0,  ...,  670,  670,  670]], device='cuda:0')}"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = torch.load(\"./PyGdata.pt\")\n",
+    "data.edge_index_dict\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('user',\n",
+       "  'rates',\n",
+       "  'movie'): tensor([[   0,    0,    0,  ...,  670,  670,  670],\n",
+       "         [   0,    1,    2,  ..., 1327, 1329, 2941]], device='cuda:0'),\n",
+       " ('movie',\n",
+       "  'rev_rates',\n",
+       "  'user'): tensor([[   0,    1,    2,  ..., 1327, 1329, 2941],\n",
+       "         [   0,    0,    0,  ...,  670,  670,  670]], device='cuda:0')}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class GNNEncoder(torch.nn.Module):\n",
+    "    def __init__(self, hidden_channels, out_channels):\n",
+    "        super().__init__()\n",
+    "        # these convolutions have been replicated to match the number of edge types\n",
+    "        self.conv1 = SAGEConv((-1, -1), hidden_channels)\n",
+    "        self.conv2 = SAGEConv((-1, -1), out_channels)\n",
+    "\n",
+    "    def forward(self, x, edge_index):\n",
+    "        x = self.conv1(x, edge_index).relu()\n",
+    "        x = self.conv2(x, edge_index)\n",
+    "        return x\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class EdgeDecoder(torch.nn.Module):\n",
+    "    def __init__(self, hidden_channels):\n",
+    "        super().__init__()\n",
+    "        self.lin1 = Linear(2 * hidden_channels, hidden_channels)\n",
+    "        self.lin2 = Linear(hidden_channels, 1)\n",
+    "\n",
+    "    def forward(self, z_dict, edge_label_index):\n",
+    "        row, col = edge_label_index\n",
+    "        # concat user and movie embeddings\n",
+    "        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)\n",
+    "        # concatenated embeddings passed to linear layer\n",
+    "        z = self.lin1(z).relu()\n",
+    "        z = self.lin2(z)\n",
+    "        return z.view(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Model(torch.nn.Module):\n",
+    "    def __init__(self, hidden_channels):\n",
+    "        super().__init__()\n",
+    "        self.encoder = GNNEncoder(hidden_channels, hidden_channels)\n",
+    "        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')\n",
+    "        self.decoder = EdgeDecoder(hidden_channels)\n",
+    "\n",
+    "    def forward(self, x_dict, edge_index_dict, edge_label_index):\n",
+    "        # z_dict contains dictionary of movie and user embeddings returned from GraphSage\n",
+    "        z_dict = self.encoder(x_dict, edge_index_dict)\n",
+    "        return self.decoder(z_dict, edge_label_index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "HeteroData(\n",
+      "  user={ x=[671, 671] },\n",
+      "  movie={ x=[9025, 404] },\n",
+      "  (user, rates, movie)={\n",
+      "    edge_index=[2, 99810],\n",
+      "    edge_label=[99810],\n",
+      "  },\n",
+      "  (movie, rev_rates, user)={ edge_index=[2, 99810] }\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = Model(hidden_channels=32).to(device)\n",
+    "model2 = Model(hidden_channels=32).to(device)\n",
+    "model.load_state_dict(torch.load(\"PyGTrainedModelState.pt\"))\n",
+    "model.eval()\n",
+    "\n",
+    "total_users = data['user'].num_nodes \n",
+    "total_movies = data['movie'].num_nodes \n",
+    "print(data)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 671/671 [00:05<00:00, 121.64it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "movie_recs = []\n",
+    "for user_id in tqdm(range(0, total_users)):\n",
+    "    user_row = torch.tensor([user_id] * total_movies)\n",
+    "    all_movie_ids = torch.arange(total_movies)\n",
+    "    edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)\n",
+    "    pred = model(data.x_dict, data.edge_index_dict,\n",
+    "             edge_label_index)\n",
+    "    pred = pred.clamp(min=0, max=5)\n",
+    "    # we will only select movies for the user where the predicting rating is =5\n",
+    "    rec_movie_ids = (pred == 5).nonzero(as_tuple=True)\n",
+    "    top_ten_recs = [rec_movies for rec_movies in rec_movie_ids[0].tolist()[:10]]\n",
+    "    movie_recs.append({'user': user_id, 'rec_movies': top_ten_recs})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Movie not found\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\aj\\AppData\\Local\\Temp\\ipykernel_24552\\778055959.py:2: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv(metadata_path)\n"
+     ]
+    }
+   ],
+   "source": [
+    "metadata_path = './sampled_movie_dataset/movies_metadata.csv'\n",
+    "df = pd.read_csv(metadata_path)\n",
+    "df.columns\n",
+    "\n",
+    "def get_movie_title(movie_id):\n",
+    "    \"\"\"Looks up a movie title by its ID in the DataFrame.\"\"\"\n",
+    "\n",
+    "    row = df[df['id'] == movie_id]\n",
+    "\n",
+    "    if not row.empty:\n",
+    "        return row['title'].iloc[0]  # Get the title from the first matching row\n",
+    "    else:\n",
+    "        return \"Movie not found\"\n",
+    "    \n",
+    "print(get_movie_title(14))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   user                                         rec_movies\n",
+      "0     0   [14, 85, 101, 106, 111, 131, 132, 150, 210, 216]\n",
+      "1     1    [13, 45, 95, 108, 109, 126, 130, 132, 213, 220]\n",
+      "2     2  [562, 571, 894, 1013, 1169, 1289, 1378, 1405, ...\n",
+      "3     3  [126, 137, 502, 571, 616, 696, 811, 966, 999, ...\n",
+      "4     4  [364, 436, 493, 502, 509, 706, 781, 811, 1244,...\n",
+      "Index(['user', 'rec_movies'], dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "movie_recs_df = pd.DataFrame(movie_recs)\n",
+    "#movie_recs_df = movie_recs_df.set_index('id').join(df[['title']].set_index('id'), how='left')\n",
+    "print(movie_recs_df.head()) \n",
+    "print(movie_recs_df.columns) "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

PyGTrainedModelState.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d31578ebb232fd63763d00f19051db44f8841e7117b9203c2f316bcae91a5deb
+size 307626

PyGdata.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27cbee2403cf96a9942394e4ad78f229b52bb8ca8c2e16839b3083bcaef877a6
+size 20380556

model_def.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import torch
+from torch_geometric.nn import SAGEConv, to_hetero, Linear
+from dotenv import load_dotenv
+data = torch.load("./PyGdata.pt")
+class GNNEncoder(torch.nn.Module):
+    def __init__(self, hidden_channels, out_channels):
+        super().__init__()
+        self.conv1 = SAGEConv((-1, -1), hidden_channels)
+        self.conv2 = SAGEConv((-1, -1), out_channels)
+    def forward(self, x, edge_index):
+        x = self.conv1(x, edge_index).relu()
+        x = self.conv2(x, edge_index)
+        return x
+class EdgeDecoder(torch.nn.Module):
+    def __init__(self, hidden_channels):
+        super().__init__()
+        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
+        self.lin2 = Linear(hidden_channels, 1)
+    def forward(self, z_dict, edge_label_index):
+        row, col = edge_label_index
+        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)
+        z = self.lin1(z).relu()
+        z = self.lin2(z)
+        return z.view(-1)
+class Model(torch.nn.Module):
+    def __init__(self, hidden_channels):
+        super().__init__()
+        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
+        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
+        self.decoder = EdgeDecoder(hidden_channels)
+    def forward(self, x_dict, edge_index_dict, edge_label_index):
+        z_dict = self.encoder(x_dict, edge_index_dict)
+        return self.decoder(z_dict, edge_label_index)

movie_embeddings.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e80e58c0f25e6ec7fbe54e9668f233c7ddc3083f268cb21a4b6917ac09332cee
+size 13863625

movie_embeddings_concat.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74b32c1f6eb41a76cc7d392a1fc709b6e5c5002cf5c91c523f1906ca753319a0
+size 14585708

requirements.txt ADDED Viewed

Binary file (3.35 kB). View file

visualizer.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import umap.umap_ as umap
+import plotly.express as px
+import pandas as pd
+import random
+import viz_utils
+import torch
+import torch
+import torch.nn.functional as F
+from torch.nn import Linear
+import torch_geometric.transforms as T
+from torch_geometric.nn import SAGEConv, to_hetero
+from torch_geometric.transforms import RandomLinkSplit, ToUndirected
+from sentence_transformers import SentenceTransformer
+from torch_geometric.data import HeteroData
+import yaml
+data = torch.load("./PyGdata.pt")
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+movies_df = pd.read_csv("./sampled_movie_dataset/movies_metadata.csv")
+class GNNEncoder(torch.nn.Module):
+    def __init__(self, hidden_channels, out_channels):
+        super().__init__()
+        # these convolutions have been replicated to match the number of edge types
+        self.conv1 = SAGEConv((-1, -1), hidden_channels)
+        self.conv2 = SAGEConv((-1, -1), out_channels)
+    def forward(self, x, edge_index):
+        x = self.conv1(x, edge_index).relu()
+        x = self.conv2(x, edge_index)
+        return x
+class EdgeDecoder(torch.nn.Module):
+    def __init__(self, hidden_channels):
+        super().__init__()
+        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
+        self.lin2 = Linear(hidden_channels, 1)
+    def forward(self, z_dict, edge_label_index):
+        row, col = edge_label_index
+        # concat user and movie embeddings
+        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)
+        # concatenated embeddings passed to linear layer
+        z = self.lin1(z).relu()
+        z = self.lin2(z)
+        return z.view(-1)
+class Model(torch.nn.Module):
+    def __init__(self, hidden_channels):
+        super().__init__()
+        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
+        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
+        self.decoder = EdgeDecoder(hidden_channels)
+    def forward(self, x_dict, edge_index_dict, edge_label_index):
+        # z_dict contains dictionary of movie and user embeddings returned from GraphSage
+        z_dict = self.encoder(x_dict, edge_index_dict)
+        return self.decoder(z_dict, edge_label_index)
+model = Model(hidden_channels=32).to(device)
+model2 = Model(hidden_channels=32).to(device)
+model.load_state_dict(torch.load("PyGTrainedModelState.pt"))
+model.eval()
+total_users = data['user'].num_nodes
+total_movies = data['movie'].num_nodes
+print("total users =", total_users)
+print("total movies =", total_movies)
+with torch.no_grad():
+    a = model.encoder(data.x_dict,data.edge_index_dict)
+    user = pd.DataFrame(a['user'].detach().cpu())
+    movie = pd.DataFrame(a['movie'].detach().cpu())
+    embedding_df = pd.concat([user, movie], axis=0)
+movie_index = 20
+title = movies_df.iloc[movie_index]['title']
+print(title)
+fig_umap = viz_utils.visualize_embeddings_umap(embedding_df)
+viz_utils.save_visualization(fig_umap, "./Visualizations/umap_visualization")
+fig_tsne = viz_utils.visualize_embeddings_tsne(embedding_df)
+viz_utils.save_visualization(fig_tsne, "./Visualizations/tsne_visualization")
+fig_pca = viz_utils.visualize_embeddings_pca(embedding_df)
+viz_utils.save_visualization(fig_pca, "./Visualizations/pca_visualization")

viz_utils.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import umap.umap_ as umap
+import plotly.express as px
+import pandas as pd
+import random
+import numpy
+from sklearn.manifold import TSNE
+from sklearn.decomposition import PCA
+import os
+# Switch the working directory to this file's folder before reading the relative
+# data path below. NOTE: this runs at import time and changes the working
+# directory of the whole importing process.
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+movies_df = pd.read_csv("./sampled_movie_dataset/movies_metadata.csv")
+##all_genres = movies_df['genres'].unique().tolist()  # Adjust the column name if needed
+# NOTE(review): 671 matches the user count hard-coded in the plot color lists, but
+# movies_df is read from a movie metadata file — confirm this offset is intended.
+# `genres` is not referenced by any function in this module.
+genres = movies_df['genres'].tolist()[671:] # Offset to start at movies
+##can't get to work for coloring by genre
+def get_genre_for_movie(movie_index):
+    genres_str = movies_df.iloc[movie_index]['genres']
+    # You might need to parse genres_str if it's not a simple list
+    return genres_str  # Or a list of genres
+print(get_genre_for_movie(20))
+def visualize_embeddings_umap(embedding_df, n_neighbors=15, min_dist=0.1, n_components=3):
+    # Convert Series to DataFrame
+    #embedding_df = pd.DataFrame(embedding_series.tolist(), columns=[f'dim_{i+1}' for i in range(len(embedding_series[0]))])
+    # Perform UMAP dimensionality reduction
+    umap_embedded = umap.UMAP(
+        n_neighbors=n_neighbors,
+        min_dist=min_dist,
+        n_components=n_components,
+        random_state=42,
+    ).fit_transform(embedding_df.values)
+    # Plot the UMAP embedding
+    umap_df = pd.DataFrame(umap_embedded, columns=['UMAP Dimension 1', 'UMAP Dimension 2', 'UMAP Dimension 3'])
+    umap_df['Label'] = embedding_df.index
+    color = [0]*671 + [1]*9025
+    umap_df['color'] = color
+    # Plot the UMAP embedding using Plotly Express
+    fig = px.scatter_3d(umap_df, x='UMAP Dimension 1', y='UMAP Dimension 2',z='UMAP Dimension 3',color='color',hover_data=['Label'], title='UMAP Visualization of Embeddings')
+    return fig
+def visualize_embeddings_tsne(embedding_df, n_components=3, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0):
+    # Perform t-SNE dimensionality reduction
+    tsne_embedded = TSNE(
+        n_components=n_components,
+        perplexity=perplexity,
+        early_exaggeration=early_exaggeration,
+        learning_rate=learning_rate,
+        random_state=42,
+    ).fit_transform(embedding_df.values)
+    # Plot the t-SNE embedding
+    tsne_df = pd.DataFrame(tsne_embedded, columns=[f't-SNE Dimension {i+1}' for i in range(n_components)])
+    tsne_df['Label'] = embedding_df.index
+    # Add color column (adjust how colors are applied based on your data)
+    tsne_df['color'] = [0]*671 + [1]*9025
+    fig = px.scatter_3d(tsne_df, x='t-SNE Dimension 1', y='t-SNE Dimension 2', z='t-SNE Dimension 3', color='color', hover_data=['Label'], title='t-SNE Visualization of Embeddings')
+    return fig
+def visualize_embeddings_pca(embedding_df, n_components=3):
+    # Perform PCA
+    pca = PCA(n_components=n_components, random_state=42)
+    pca_embedded = pca.fit_transform(embedding_df.values)
+    # Plot the PCA embedding
+    pca_df = pd.DataFrame(pca_embedded, columns=[f'PCA Dimension {i+1}' for i in range(n_components)])
+    pca_df['Label'] = embedding_df.index
+    # Add color column (adjust how colors are applied based on your data)
+    pca_df['color'] = [0]*671 + [1]*9025
+    fig = px.scatter_3d(pca_df, x='PCA Dimension 1', y='PCA Dimension 2', z='PCA Dimension 3', color='color', hover_data=['Label'], title='PCA Visualization of Embeddings')
+    return fig
+def save_visualization(fig, filename):
+    fig.write_html(f"{filename}.html")