File size: 8,727 Bytes
28be794
 
3454357
8def105
f482a30
bc5676d
3a4738e
97ede01
3454357
f4547e9
 
f482a30
3a4738e
 
f4547e9
f482a30
f4547e9
 
72c7204
28be794
3a4738e
 
28be794
 
3454357
6d919a4
 
14448db
5fdacf8
6764e28
5fdacf8
 
6764e28
76053a8
6d919a4
 
 
 
 
6764e28
6d919a4
 
 
77bad41
6d919a4
5f78421
6764e28
6d919a4
6764e28
6d919a4
14448db
3017076
 
5ec2e5a
 
3017076
 
6d919a4
6764e28
6d919a4
14448db
6d919a4
3017076
6d919a4
 
3017076
 
6d919a4
 
 
 
 
 
 
 
 
 
 
 
 
 
5ec2e5a
 
 
 
 
 
 
 
 
 
6d919a4
 
 
77bad41
6d919a4
 
2733741
48e5161
3454357
 
 
 
 
 
 
 
1c00bf5
3454357
 
35b1251
 
 
 
 
 
 
 
 
 
28be794
a203666
8cc1051
35b1251
 
28be794
35b1251
 
 
28be794
8e36b55
846c9b9
35b1251
6143265
 
 
 
 
35b1251
 
 
6143265
28be794
35b1251
 
28be794
35b1251
28be794
6143265
35b1251
28be794
 
35b1251
 
28be794
35b1251
 
 
 
 
3454357
 
 
 
5ec2e5a
 
7e5fed2
 
 
 
 
 
 
 
f50ff81
3017076
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import streamlit as st
import pandas as pd
import altair as alt
from PIL import Image
#!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
import sys
import json
#from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, evaluation, losses, InputExample
from torch.utils.data import IterableDataset
import tqdm
#from torch.utils.data import Dataset
import random
from shutil import copyfile
from urllib.error import URLError

## TEXT ######################################################################################################################

# Static page copy follows; the document embeddings themselves are loaded further down, just before main().

# Page title followed by a short problem / solution / approach summary.
st.title(" Nashville Analytics Summit Conference Helper")

st.markdown("**Problem** 🧐:")
st.markdown("Since its inception in 2013, Nashville Analytics Summit has seen a growth of over 488%. The Summit prides itself as the fastest growing locally grown tech events in the south region. With an increasing number of participants and dozens of talks covering a myriad of topics, there is a need to tailor participants needs to their interests")
st.markdown("---")
st.markdown("**Solution**💡:")
st.markdown("Develop an application in which users can input the description of areas of interest and app returns the top three Sessions matching the description requested.")
st.markdown("---")
# No whitespace may follow the opening `**`; with a space there, Markdown
# prints the asterisks literally instead of rendering the label in bold.
st.markdown("**Approach** 🗝️ :")
st.markdown("* For the approach, I used a transformer model, multi-qa-MiniLM-L6-cos-v1, that uses sentence similarity to match the description of the event and the input description.")
st.markdown("* The dataset used is Nashville Analytics Summit descriptions of the presentations, which include the Unique ID, Name of presenter, Description of presentation, Activity Code, Start Time, End Time, Location Name")

st.markdown("---")

#st.markdown("### The Details")

# Section 1: heading and long-form statement of the problem, emitted in order.
for _problem_text in (
    "### Problem",
    "With the growth of the Nashville Analytics Summit participation every year, there have been an increase in the number participants, the number of talks/topics have exponentially increased over time covering a myriad of current issues and developments, analytics, data science AI, career growth, new tools and much more. This has made it challenging for the participants to easily navigate through the events and attend the sessions of interest before reading through the sessions descriptions. Struggling finding the Session that Matches your personal interests? Transformers to the rescue!!! 🦸‍♂️ .",
):
    st.markdown(_problem_text)

# Illustration shown under the problem statement; the image file is opened
# from the current working directory.
dificult_reading = Image.open('dificult_reading.png')
st.image(
    dificult_reading,
    caption='Reading sessions descriptions.',
)

# Divider closing the section.
st.markdown("---")
# Section 2: how semantic search / sentence similarity addresses the problem.
st.markdown("### How can Transformers Help?🪄 ")
st.markdown("* Semantic search applies the user intent, context, and conceptual meanings to match a user query to the corresponding content. It uses vector search to return results that aim to match a users query. The components work together to retrieve and rank the results based on the meaning.")

st.markdown("**Sentence Similarity**")

# NOTE(review): the wording of the next bullet ("In this  a similar project")
# looks garbled; left unchanged because the intended sentence is unclear.
st.markdown("* In this  a similar project we used a dataset from the Nashville Analytics Summit")
# Wrong words fixed in the user-facing text: "marches" -> "matches",
# "search for" -> "searches for".
st.markdown("* Given a search phrase, the model searches for an abstract that matches the search phrase using cosine similarity. ")

st.markdown("---")
# Section: what the participant does and what the search returns.
st.markdown("### The Process 🔍")

st.markdown("A participant inputs a search phrase of the sessions he/she would like to attend and the semantic search would return the title of the session, the abstract of the session, the room number and the time the session will be offered.")


# The lead-in sentence and its bullet are separated by a blank line so that
# Markdown renders a paragraph followed by a list item. A backslash line
# continuation inside the literal would instead glue the "- ..." text onto
# the sentence (with an embedded tab) and render it as one run-on line.
st.markdown(
    "The output of the search query\n\n"
    "- Once the user input their phrase or keywords to search sessions of interest, the appropriate sessions are selected by finding the top 3 sessions with the highest similarity scores to the phrase or keyword."
)

#st.markdown("3. Send (claim, evidence) pairs to a transformer model. Have the model predict whether each evidence supports, refutes, or is not relevant to the claim. (📍 YOU ARE HERE!)")

#st.markdown("4. Report back to the user: The supporting evidence for the claim (if any), the refuting evidence for the claim (if any). If no relevant evidence is found, report that the claim cannot be supported or refuted by current evidence.")


# section 5: my work
#st.markdown("## Climate Claim Fact-Checking with Transformers")

#st.markdown("My work focuses on step 3 of the process: Training a transformer model to accurately categorize (claim, evidence) as:")
#st.markdown("* evidence *supports* (entails) claim")
#st.markdown("* evidence *refutes* (contradicts) claim")
#st.markdown("* evidence *does not provide enough info to support or refute* (neutral) claim")
#st.markdown("For this project, I fine-tune ClimateBERT (4) on the text entailment task")
# Section 6: critical analysis. A divider, the heading, the guiding question
# and the improvement ideas are emitted in order, then a closing divider.
for _analysis_text in (
    "---",
    "## Critical Analysis",
    "What can be done to improve the accuracy?",
    "* Giving more data to be used to create embeddings, this would improve the model performance.",
    "* A few pretrained Sentence Similarity models could be used and pick the one with higher accuracy and generalize well to new semantics!",
    "* Creating an app that can be multilingual and output the search in users preferred language",
    "---",
):
    st.markdown(_analysis_text)


## EXAMPLE ###################################################################################################################

# Interactive demo: introductory copy first, then the data that main() reads.
for _demo_text in (
    "## Let's try it out!",
    "This application is a dashboard for displaying your top 3 matching Sessions at the Nashville summit",
):
    st.markdown(_demo_text)

# Numeric matrix parsed from a whitespace-delimited text file.
# NOTE(review): presumably one embedding row per session in sessions.csv,
# in the same order; confirm against whatever generated the file.
doc_emb = np.loadtxt("abstract-embed.txt", dtype=float)

# Session metadata; only the columns named here are read from the CSV.
_session_columns = [
    'Unique ID',
    'Name',
    'Description',
    'Activity Code',
    'Start Time',
    'End Time',
    'Location Name',
]
df = pd.read_csv("sessions.csv", usecols=_session_columns)

# Banner markup that main() renders above the search box.
html_temp = """
<div style ="background-color:lightblue;padding:13px">
<h1 style ="color:white;text-align:center;">Sentence Similarity App for Nashville Analytic Summit 2022</h1>
</div>
        """
def main():
    """Render the search box and list the sessions most similar to the query.

    Reads three module-level objects: ``html_temp`` (banner markup), ``df``
    (session table) and ``doc_emb`` (matrix of document embeddings). The
    query is embedded with a sentence-embedding model, compared with every
    row of ``doc_emb`` by cosine similarity, and the three best-scoring
    sessions are written to the page. Nothing is returned.
    """
    # Display the front-end banner.
    st.markdown(html_temp, unsafe_allow_html=True)

    # Sentence-embedding model named in the page's "Approach" text. A
    # text-classification pipeline would return label/score dicts rather
    # than a vector, which cannot be fed to a cosine-similarity function.
    # NOTE(review): presumably the same model produced abstract-embed.txt;
    # confirm, since scores are only meaningful within one embedding space.
    # NOTE(review): the model is re-created on every Streamlit rerun; wrapping
    # this in Streamlit's caching decorator would avoid that.
    model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

    query = st.text_input("Enter your query: ")
    if not query:
        # Nothing typed yet: show only the banner and the input box.
        return

    # Cast both sides to float32 so the matrix product inside the similarity
    # function does not fail on a float32/float64 dtype mismatch (np.loadtxt
    # was asked for Python floats, i.e. float64).
    query_emb = np.asarray(model.encode(query), dtype=np.float32)
    doc_matrix = doc_emb.astype(np.float32)

    # Row 0 of the result holds the query's similarity to every document;
    # convert it to plain Python floats for sorting and formatting.
    scores = util.pytorch_cos_sim(query_emb, doc_matrix)[0].tolist()

    if len(scores) != len(df):
        # zip() below would silently truncate and could pair scores with the
        # wrong sessions, so make the inconsistency visible.
        st.warning(
            f"Found {len(scores)} embeddings for {len(df)} sessions; "
            "results may be misaligned."
        )

    # Combine each score with the attributes shown for a session.
    sessions = zip(
        scores,
        df["Description"],
        df["Name"],
        df["Start Time"],
        df["End Time"],
        df["Location Name"],
    )

    # Number of results to return.
    top_k = 3

    # Highest similarity first; sorted() is stable, so ties keep CSV order.
    ranked = sorted(sessions, key=lambda row: row[0], reverse=True)[:top_k]

    # Shown on the page (not printed to the server console).
    st.write(f"Your top {top_k} most similar sessions in the Summit:")

    # Output presentation recommendations.
    for score, doc, title, start_time, end_time, location in ranked:
        st.write(f"Score: {score:f}")
        st.write(f"Title: {title}")
        st.write(f"Abstract: {doc}")
        st.write(f"Location: {location}")
        st.write(f"From {start_time} to {end_time}")
        st.write('\n')


# Run the interactive search widget when the file is executed as a script.
if __name__ == "__main__":
    main()

# Closing section: divider, headings and the numbered reference list,
# emitted in order as separate Markdown elements.
for _footer_text in (
    "---",
    "## Resource Links",
    "### References",
    "1. https://www.sbert.net/examples/applications/semantic-search/README.html",
    "2. https://www.sbert.net/docs/pretrained-models/msmarco-v3.html",
    "3. Semantic search [colab](https://colab.research.google.com/drive/12cn5Oo0v3HfQQ8Tv6-ukgxXSmT3zl35A?usp=sharing)",
    "4. [project code on github](https://github.com/vanderbilt-data-science/abstract-search)",
):
    st.markdown(_footer_text)