"""Streamlit app: semantic-similarity network of TinyStories short stories.

The script embeds a small sample of stories with a sentence-transformer
model, links every pair of stories whose cosine similarity exceeds a
user-selected threshold, displays the ego network of the best-connected
story, and reports the Laplacian centrality of every story.

Launch with ``streamlit run <this_file>.py``.
"""

# standard library dependencies
from operator import itemgetter

# external dependencies
import networkx as nx
import numpy as np
import pandas as pd
import streamlit as st
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from streamlit_agraph import agraph, Node, Edge, Config

# Number of stories taken from the head of the training split.
NUM_STORIES = 10

dataset = load_dataset("roneneldan/TinyStories")

# -------------------------------------------------------------
# Introduction and controls
# -------------------------------------------------------------
st.markdown('# Short Stories, networks and connections')
st.markdown('In this example we consider the semantic similarity between short stories generatited by GenAI.')
st.markdown('We study the relationshis between the stories using a network. The laplacian connectivity provides inights about the closeness of the graph')

st.markdown('## Short Stories')
# The link target must be an absolute URL; a bare "roneneldan/TinyStories"
# is resolved relative to the app's own address and leads nowhere.
st.markdown('We are using a sample fo the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset from roneneldan work')

text_text = dataset['train'][10]['text']
st.write(text_text)
st.markdown(':red[' + text_text + ']' )
st.markdown("The next word is red",unsafe_allow_html=True)

st.markdown('The threshold changes the level of connectivity in the network. The reange is from 0 (less similar) to 1 (completely similar)')
threshhold = st.slider('Threshhold',0.0,1.0,step=0.1)

# -------------------------------------------------------------
# Embeddings and similarity graph
# -------------------------------------------------------------
model = SentenceTransformer('all-MiniLM-L6-v2')

# Dataset records (dicts with a 'text' field); kept whole because the node
# tooltips below read sentences[i]['text'].
sentences = [dataset['train'][ii] for ii in range(NUM_STORIES)]

# Encode the story strings themselves rather than the record dicts.
story_texts = [record['text'] for record in sentences]
embeddings = model.encode(story_texts, convert_to_tensor=True)

# Pairwise cosine similarities between all stories.
cosine_scores = util.cos_sim(embeddings, embeddings)

# Weighted adjacency matrix: mirror the strict lower triangle so the matrix
# is exactly symmetric and the diagonal stays zero. Leaving the diagonal
# (self-similarity of about 1) in place would give every node a self-loop.
# .cpu() makes the conversion work when the model runs on a GPU.
lower_triangle = np.tril(cosine_scores.cpu().numpy().astype(float), k=-1)
A = lower_triangle + lower_triangle.T

# Two stories are connected when their similarity exceeds the threshold.
G = nx.from_numpy_array((A > threshhold).astype(int))

st.markdown('We can visualize the similarity between the shorts stories as a network. It the similarity is greater than the threshold, the two nodes are conencted')

# -------------------------------------------------------------
# Ego network of the most connected story
#
# A node's egonet is the (sub)network comprised of the focal node and all
# the nodes to which it is adjacent, together with the edges among them.
# -------------------------------------------------------------
# (node, degree) pair of the node with the largest degree; ties resolve to
# the first such node in iteration order.
most_connected_node = max(G.degree, key=itemgetter(1))

# Create egonet for the focal node
hub_ego = nx.ego_graph(G, most_connected_node[0])

# Now create the equivalent Node and Edge lists
nodes = [
    Node(title=str(sentences[i]['text']), id=i, label='node_'+str(i), size=20)
    for i in hub_ego.nodes
]
# The ego graph is the induced subgraph, so its edges are exactly the edges
# of G with both endpoints inside the egonet.
edges = [
    Edge(source=i, target=j, type="CURVE_SMOOTH")
    for (i, j) in hub_ego.edges
]

config = Config(width=500,
                height=500,
                directed=True,
                nodeHighlightBehavior=False,
                highlightColor="#F7A7A6", # or "blue"
                collapsible=False,
                node={'labelProperty':'label'},
                # **kwargs e.g. node_size=1000 or node_color="blue"
                )

return_value = agraph(nodes=nodes,
                      edges=edges,
                      config=config)

# -------------------------------------------------------------
# Laplacian centrality
# -------------------------------------------------------------
st.markdown('The Laplacian centrality is a measure of closeness')

# The normalised Laplacian centrality divides by the total Laplacian energy
# of the graph, which is zero when no pair of stories clears the threshold;
# report zero centrality for every node in that case instead of failing.
if G.number_of_edges() > 0:
    centrality = nx.laplacian_centrality(G)
else:
    centrality = {node: 0.0 for node in G}

st.write(str(centrality))

# One row per node, indexed by node id, ready for the bar chart.
df_lc = pd.DataFrame.from_dict(
    centrality, orient='index', columns=['laplacian_centrality']
)
st.bar_chart(df_lc)