Spaces:
Runtime error
Runtime error
domenicrosati
committed on
Commit
·
03de2e8
1
Parent(s):
82fe24c
use abstracts as a ranking signal
Browse files
app.py
CHANGED
@@ -33,12 +33,18 @@ def remove_html(x):
|
|
33 |
# deduplicate
|
34 |
# search per query
|
35 |
|
|
|
|
|
36 |
|
37 |
-
|
|
|
38 |
term = clean_query(term, clean=clean, strict=strict)
|
39 |
# heuristic, 2 searches strict and not? and then merge?
|
40 |
-
# https://api.scite.ai/search?mode=
|
41 |
-
|
|
|
|
|
|
|
42 |
req = requests.get(
|
43 |
search,
|
44 |
headers={
|
@@ -49,13 +55,18 @@ def search(term, limit=10, clean=True, strict=True):
|
|
49 |
req.json()
|
50 |
except:
|
51 |
return [], []
|
|
|
|
|
52 |
return (
|
53 |
-
|
54 |
[(doc['doi'], doc['citations'], doc['title'])
|
55 |
-
|
56 |
)
|
57 |
|
58 |
|
|
|
|
|
|
|
59 |
def find_source(text, docs):
|
60 |
for doc in docs:
|
61 |
if text in remove_html(doc[1][0]['snippet']):
|
@@ -150,6 +161,9 @@ st.markdown("""
|
|
150 |
""", unsafe_allow_html=True)
|
151 |
|
152 |
with st.expander("Settings (strictness, context limit, top hits)"):
|
|
|
|
|
|
|
153 |
strict_lenient_mix = st.radio(
|
154 |
"Type of strict+lenient combination: Fallback or Mix? If fallback, strict is run first then if the results are less than context_lim we also search lenient. Mix will search them both and let reranking sort em out",
|
155 |
('fallback', 'mix'))
|
@@ -178,9 +192,9 @@ def run_query(query):
|
|
178 |
# could also try fallback if there are no good answers by score...
|
179 |
limit = top_hits_limit or 100
|
180 |
context_limit = context_lim or 10
|
181 |
-
contexts_strict, orig_docs_strict = search(query, limit=limit, strict=True)
|
182 |
if strict_lenient_mix == 'fallback' and len(contexts_strict) < context_limit:
|
183 |
-
contexts_lenient, orig_docs_lenient = search(query, limit=limit, strict=False)
|
184 |
contexts = list(
|
185 |
set(contexts_strict + contexts_lenient)
|
186 |
)
|
|
|
33 |
# deduplicate
|
34 |
# search per query
|
35 |
|
36 |
+
# options are abstract search
|
37 |
+
# all search
|
38 |
|
39 |
+
|
40 |
+
def search(term, limit=10, clean=True, strict=True, abstracts=True):
|
41 |
term = clean_query(term, clean=clean, strict=strict)
|
42 |
# heuristic, 2 searches strict and not? and then merge?
|
43 |
+
# https://api.scite.ai/search?mode=all&term=unit%20testing%20software&limit=10&date_from=2000&date_to=2022&offset=0&supporting_from=1&contrasting_from=0&contrasting_to=0&user_slug=domenic-rosati-keW5&compute_aggregations=true
|
44 |
+
mode = 'all'
|
45 |
+
if not abstracts:
|
46 |
+
mode = 'citations'
|
47 |
+
search = f"https://api.scite.ai/search?mode={mode}&term={term}&limit={limit}&offset=0&user_slug=domenic-rosati-keW5&compute_aggregations=false"
|
48 |
req = requests.get(
|
49 |
search,
|
50 |
headers={
|
|
|
55 |
req.json()
|
56 |
except:
|
57 |
return [], []
|
58 |
+
|
59 |
+
citation_contexts = [remove_html('\n'.join([cite['snippet'] for cite in doc['citations']])) for doc in req.json()['hits']]
|
60 |
return (
|
61 |
+
citation_contexts,
|
62 |
[(doc['doi'], doc['citations'], doc['title'])
|
63 |
+
for doc in req.json()['hits']]
|
64 |
)
|
65 |
|
66 |
|
67 |
+
|
68 |
+
|
69 |
+
|
70 |
def find_source(text, docs):
|
71 |
for doc in docs:
|
72 |
if text in remove_html(doc[1][0]['snippet']):
|
|
|
161 |
""", unsafe_allow_html=True)
|
162 |
|
163 |
with st.expander("Settings (strictness, context limit, top hits)"):
|
164 |
+
support_abstracts = st.radio(
|
165 |
+
"Use abstracts as a ranking signal (if the words are matched in the abstract then the document is more relevant)?",
|
166 |
+
('yes', 'no'))
|
167 |
strict_lenient_mix = st.radio(
|
168 |
"Type of strict+lenient combination: Fallback or Mix? If fallback, strict is run first then if the results are less than context_lim we also search lenient. Mix will search them both and let reranking sort em out",
|
169 |
('fallback', 'mix'))
|
|
|
192 |
# could also try fallback if there are no good answers by score...
|
193 |
limit = top_hits_limit or 100
|
194 |
context_limit = context_lim or 10
|
195 |
+
contexts_strict, orig_docs_strict = search(query, limit=limit, strict=True, abstracts=support_abstracts == 'yes')
|
196 |
if strict_lenient_mix == 'fallback' and len(contexts_strict) < context_limit:
|
197 |
+
contexts_lenient, orig_docs_lenient = search(query, limit=limit, strict=False, abstracts=support_abstracts == 'yes')
|
198 |
contexts = list(
|
199 |
set(contexts_strict + contexts_lenient)
|
200 |
)
|