File size: 3,404 Bytes
06d9388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import umap.umap_ as umap
import plotly.express as px
import pandas as pd
import random
import numpy
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import os

# Switch the working directory to the folder containing this file so the
# relative CSV path below resolves no matter where the script is launched from.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Metadata table loaded from the sampled dataset; its 'genres' column is read below.
movies_df = pd.read_csv("./sampled_movie_dataset/movies_metadata.csv")


##all_genres = movies_df['genres'].unique().tolist()  # Adjust the column name if needed
# Keep only the 'genres' entries from position 671 onward.
genres = movies_df['genres'].tolist()[671:] # Offset to start at movies



# NOTE: not yet used by the plots below -- coloring points by genre did not work.
def get_genre_for_movie(movie_index):
    """Return the raw ``genres`` value of the ``movies_df`` row at *movie_index*.

    The value is returned exactly as stored in the table; no parsing is
    attempted, so callers may still need to decode it themselves.
    """
    selected_row = movies_df.iloc[movie_index]
    return selected_row['genres']

# Debug output: prints the 'genres' value of the row at position 20 each time this module runs.
print(get_genre_for_movie(20))



def visualize_embeddings_umap(embedding_df, n_neighbors=15, min_dist=0.1, n_components=3, split_index=671):
    """Reduce embeddings with UMAP and return a 3-D Plotly scatter figure.

    Args:
        embedding_df: DataFrame holding one embedding per row. Its index is
            attached to every point as the hover label.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of output dimensions. Must be at least 3 because
            the first three dimensions are used as the x/y/z axes.
        split_index: Rows at positions below this value get color 0, all
            later rows get color 1. The default of 671 reproduces the split
            that used to be hard-coded here (presumably the non-movie rows
            come first and the movies follow -- TODO confirm against the
            layout of the embedding table).

    Returns:
        The figure created by ``plotly.express.scatter_3d``.

    Raises:
        ValueError: If ``n_components`` is smaller than 3.
    """
    # Fail before the expensive fit: the plot below always needs three axes.
    if n_components < 3:
        raise ValueError(
            f"n_components must be at least 3 for a 3-D scatter plot, got {n_components}"
        )

    # Perform UMAP dimensionality reduction
    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(embedding_df.values)

    # One column per requested component, so n_components > 3 no longer
    # breaks the DataFrame construction.
    columns = [f'UMAP Dimension {i + 1}' for i in range(n_components)]
    umap_df = pd.DataFrame(umap_embedded, columns=columns)
    umap_df['Label'] = embedding_df.index

    # Derive the two color groups from the actual row count instead of a
    # fixed 671 + 9025 list, which only fit one specific dataset size.
    umap_df['color'] = [0 if position < split_index else 1 for position in range(len(umap_df))]

    # Plot the UMAP embedding using Plotly Express
    fig = px.scatter_3d(
        umap_df,
        x='UMAP Dimension 1',
        y='UMAP Dimension 2',
        z='UMAP Dimension 3',
        color='color',
        hover_data=['Label'],
        title='UMAP Visualization of Embeddings',
    )
    return fig

def visualize_embeddings_tsne(embedding_df, n_components=3, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, split_index=671):
    """Reduce embeddings with t-SNE and return a 3-D Plotly scatter figure.

    Args:
        embedding_df: DataFrame holding one embedding per row. Its index is
            attached to every point as the hover label.
        n_components: Number of output dimensions. Must be at least 3 because
            the first three dimensions are used as the x/y/z axes.
        perplexity: Forwarded to ``sklearn.manifold.TSNE``.
        early_exaggeration: Forwarded to ``sklearn.manifold.TSNE``.
        learning_rate: Forwarded to ``sklearn.manifold.TSNE``.
        split_index: Rows at positions below this value get color 0, all
            later rows get color 1. The default of 671 reproduces the split
            that used to be hard-coded here (presumably the non-movie rows
            come first and the movies follow -- TODO confirm against the
            layout of the embedding table).

    Returns:
        The figure created by ``plotly.express.scatter_3d``.

    Raises:
        ValueError: If ``n_components`` is smaller than 3.
    """
    # Fail before the expensive fit: the plot below always needs three axes.
    if n_components < 3:
        raise ValueError(
            f"n_components must be at least 3 for a 3-D scatter plot, got {n_components}"
        )

    # Perform t-SNE dimensionality reduction
    tsne_embedded = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        early_exaggeration=early_exaggeration,
        learning_rate=learning_rate,
        random_state=42,
    ).fit_transform(embedding_df.values)

    columns = [f't-SNE Dimension {i + 1}' for i in range(n_components)]
    tsne_df = pd.DataFrame(tsne_embedded, columns=columns)
    tsne_df['Label'] = embedding_df.index

    # Derive the two color groups from the actual row count instead of a
    # fixed 671 + 9025 list, which only fit one specific dataset size.
    tsne_df['color'] = [0 if position < split_index else 1 for position in range(len(tsne_df))]

    fig = px.scatter_3d(
        tsne_df,
        x='t-SNE Dimension 1',
        y='t-SNE Dimension 2',
        z='t-SNE Dimension 3',
        color='color',
        hover_data=['Label'],
        title='t-SNE Visualization of Embeddings',
    )
    return fig


def visualize_embeddings_pca(embedding_df, n_components=3, split_index=671):
    """Reduce embeddings with PCA and return a 3-D Plotly scatter figure.

    Args:
        embedding_df: DataFrame holding one embedding per row. Its index is
            attached to every point as the hover label.
        n_components: Number of principal components to compute. Must be at
            least 3 because the first three are used as the x/y/z axes.
        split_index: Rows at positions below this value get color 0, all
            later rows get color 1. The default of 671 reproduces the split
            that used to be hard-coded here (presumably the non-movie rows
            come first and the movies follow -- TODO confirm against the
            layout of the embedding table).

    Returns:
        The figure created by ``plotly.express.scatter_3d``.

    Raises:
        ValueError: If ``n_components`` is smaller than 3.
    """
    # Fail before fitting: the plot below always needs three axes.
    if n_components < 3:
        raise ValueError(
            f"n_components must be at least 3 for a 3-D scatter plot, got {n_components}"
        )

    # Perform PCA
    pca = PCA(n_components=n_components, random_state=42)
    pca_embedded = pca.fit_transform(embedding_df.values)

    columns = [f'PCA Dimension {i + 1}' for i in range(n_components)]
    pca_df = pd.DataFrame(pca_embedded, columns=columns)
    pca_df['Label'] = embedding_df.index

    # Derive the two color groups from the actual row count instead of a
    # fixed 671 + 9025 list, which only fit one specific dataset size.
    pca_df['color'] = [0 if position < split_index else 1 for position in range(len(pca_df))]

    fig = px.scatter_3d(
        pca_df,
        x='PCA Dimension 1',
        y='PCA Dimension 2',
        z='PCA Dimension 3',
        color='color',
        hover_data=['Label'],
        title='PCA Visualization of Embeddings',
    )
    return fig




def save_visualization(fig, filename):
    """Write *fig* to ``<filename>.html`` through its ``write_html`` method.

    Args:
        fig: Object exposing ``write_html(path)``.
        filename: Output path without the ``.html`` suffix, which is appended here.
    """
    output_path = f"{filename}.html"
    fig.write_html(output_path)