Spaces:
Sleeping
Sleeping
File size: 3,404 Bytes
06d9388 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import umap.umap_ as umap
import plotly.express as px
import pandas as pd
import random
import numpy
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import os
# Make the directory containing this script the working directory, so the
# relative dataset path below resolves regardless of where it is launched.
_script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(_script_dir)

movies_df = pd.read_csv("./sampled_movie_dataset/movies_metadata.csv")

# Earlier attempt, kept for reference (adjust the column name if needed):
# all_genres = movies_df['genres'].unique().tolist()

# 'genres' values from row position 671 onward; the original note describes
# 671 as the "offset to start at movies".
genres = movies_df['genres'].iloc[671:].tolist()

# NOTE: colouring the plots by genre does not work yet.
def get_genre_for_movie(movie_index):
    """Return the raw ``genres`` cell of the movie at a row position.

    Args:
        movie_index: Positional (``iloc``) row index into the module-level
            ``movies_df``.

    Returns:
        The value stored in the ``genres`` column for that row, unparsed.
        NOTE(review): it may need parsing if it is not a simple list --
        confirm against the CSV.
    """
    movie_row = movies_df.iloc[movie_index]
    return movie_row['genres']
# Sanity output: show the raw 'genres' value for the row at position 20.
# This is a top-level statement, so it runs whenever the module is executed.
_sample_genres = get_genre_for_movie(20)
print(_sample_genres)
def visualize_embeddings_umap(embedding_df, n_neighbors=15, min_dist=0.1, n_components=3, *, split_index=671):
    """Reduce embeddings with UMAP and return a Plotly scatter figure.

    Args:
        embedding_df: DataFrame with one embedding per row. Its ``.values``
            are passed to UMAP and its index becomes the ``Label`` hover
            field.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of output dimensions. 2 yields a 2-D scatter;
            3 or more yields a 3-D scatter of the first three dimensions.
        split_index: Rows before this position get colour value 0, the
            remaining rows get 1. The default 671 is the boundary that was
            previously hard-coded (presumably where the movie rows begin --
            confirm against the caller).

    Returns:
        The figure created by ``px.scatter`` (2-D) or ``px.scatter_3d``.

    Raises:
        ValueError: If ``n_components`` is smaller than 2.
    """
    # Reject unplottable dimensionalities before running the expensive fit.
    if n_components < 2:
        raise ValueError(
            f"n_components must be at least 2 to draw a scatter plot, got {n_components}"
        )

    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(embedding_df.values)

    # Derive the column names from n_components instead of assuming three.
    axis_names = [f'UMAP Dimension {i + 1}' for i in range(n_components)]
    umap_df = pd.DataFrame(umap_embedded, columns=axis_names)
    umap_df['Label'] = embedding_df.index

    # Two colour groups split at split_index, sized to the actual row count
    # (the old fixed [0]*671 + [1]*9025 list only fit exactly 9696 rows).
    n_rows = len(umap_df)
    n_first = max(0, min(split_index, n_rows))
    umap_df['color'] = [0] * n_first + [1] * (n_rows - n_first)

    shared_args = dict(
        color='color',
        hover_data=['Label'],
        title='UMAP Visualization of Embeddings',
    )
    if n_components == 2:
        return px.scatter(umap_df, x=axis_names[0], y=axis_names[1], **shared_args)
    return px.scatter_3d(
        umap_df, x=axis_names[0], y=axis_names[1], z=axis_names[2], **shared_args
    )
def visualize_embeddings_tsne(embedding_df, n_components=3, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, *, split_index=671):
    """Reduce embeddings with t-SNE and return a Plotly scatter figure.

    Args:
        embedding_df: DataFrame with one embedding per row. Its ``.values``
            are passed to t-SNE and its index becomes the ``Label`` hover
            field.
        n_components: Number of output dimensions, forwarded to ``TSNE``.
            2 yields a 2-D scatter; otherwise the first three dimensions
            are drawn as a 3-D scatter.
        perplexity: Forwarded to ``TSNE``.
        early_exaggeration: Forwarded to ``TSNE``.
        learning_rate: Forwarded to ``TSNE``.
        split_index: Rows before this position get colour value 0, the
            remaining rows get 1. The default 671 is the boundary that was
            previously hard-coded (presumably where the movie rows begin --
            confirm against the caller).

    Returns:
        The figure created by ``px.scatter`` (2-D) or ``px.scatter_3d``.

    Raises:
        ValueError: If ``n_components`` is smaller than 2.
    """
    # Reject unplottable dimensionalities before running the expensive fit;
    # previously this only surfaced inside the plotting call.
    if n_components < 2:
        raise ValueError(
            f"n_components must be at least 2 to draw a scatter plot, got {n_components}"
        )

    tsne_embedded = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        early_exaggeration=early_exaggeration,
        learning_rate=learning_rate,
        random_state=42,
    ).fit_transform(embedding_df.values)

    axis_names = [f't-SNE Dimension {i + 1}' for i in range(n_components)]
    tsne_df = pd.DataFrame(tsne_embedded, columns=axis_names)
    tsne_df['Label'] = embedding_df.index

    # Two colour groups split at split_index, sized to the actual row count
    # (the old fixed [0]*671 + [1]*9025 list only fit exactly 9696 rows).
    n_rows = len(tsne_df)
    n_first = max(0, min(split_index, n_rows))
    tsne_df['color'] = [0] * n_first + [1] * (n_rows - n_first)

    shared_args = dict(
        color='color',
        hover_data=['Label'],
        title='t-SNE Visualization of Embeddings',
    )
    if n_components == 2:
        return px.scatter(tsne_df, x=axis_names[0], y=axis_names[1], **shared_args)
    return px.scatter_3d(
        tsne_df, x=axis_names[0], y=axis_names[1], z=axis_names[2], **shared_args
    )
def visualize_embeddings_pca(embedding_df, n_components=3, *, split_index=671):
    """Reduce embeddings with PCA and return a Plotly scatter figure.

    Args:
        embedding_df: DataFrame with one embedding per row. Its ``.values``
            are passed to PCA and its index becomes the ``Label`` hover
            field.
        n_components: Number of principal components to keep. 2 yields a
            2-D scatter; 3 or more yields a 3-D scatter of the first three
            components.
        split_index: Rows before this position get colour value 0, the
            remaining rows get 1. The default 671 is the boundary that was
            previously hard-coded (presumably where the movie rows begin --
            confirm against the caller).

    Returns:
        The figure created by ``px.scatter`` (2-D) or ``px.scatter_3d``.

    Raises:
        ValueError: If ``n_components`` is smaller than 2.
    """
    # Reject unplottable dimensionalities up front; previously this only
    # surfaced inside the plotting call, after the fit.
    if n_components < 2:
        raise ValueError(
            f"n_components must be at least 2 to draw a scatter plot, got {n_components}"
        )

    pca = PCA(n_components=n_components, random_state=42)
    pca_embedded = pca.fit_transform(embedding_df.values)

    axis_names = [f'PCA Dimension {i + 1}' for i in range(n_components)]
    pca_df = pd.DataFrame(pca_embedded, columns=axis_names)
    pca_df['Label'] = embedding_df.index

    # Two colour groups split at split_index, sized to the actual row count
    # (the old fixed [0]*671 + [1]*9025 list only fit exactly 9696 rows).
    n_rows = len(pca_df)
    n_first = max(0, min(split_index, n_rows))
    pca_df['color'] = [0] * n_first + [1] * (n_rows - n_first)

    shared_args = dict(
        color='color',
        hover_data=['Label'],
        title='PCA Visualization of Embeddings',
    )
    if n_components == 2:
        return px.scatter(pca_df, x=axis_names[0], y=axis_names[1], **shared_args)
    return px.scatter_3d(
        pca_df, x=axis_names[0], y=axis_names[1], z=axis_names[2], **shared_args
    )
def save_visualization(fig, filename):
    """Write *fig* to an HTML file named ``<filename>.html``.

    Args:
        fig: Object exposing ``write_html`` (e.g. the figures returned by
            the visualize_* functions in this file).
        filename: Output path without extension; ``.html`` is always
            appended.
    """
    output_path = f"{filename}.html"
    fig.write_html(output_path)
|