domenicrosati committed on
Commit
5ed186b
·
1 Parent(s): a776895

support abstracts in QA

Browse files
Files changed (1) hide show
  1. app.py +65 -28
app.py CHANGED
@@ -37,30 +37,51 @@ def remove_html(x):
37
  # all search
38
 
39
 
40
- def search(term, limit=10, clean=True, strict=True, abstracts=True):
41
  term = clean_query(term, clean=clean, strict=strict)
42
  # heuristic, 2 searches strict and not? and then merge?
43
  # https://api.scite.ai/search?mode=all&term=unit%20testing%20software&limit=10&date_from=2000&date_to=2022&offset=0&supporting_from=1&contrasting_from=0&contrasting_to=0&user_slug=domenic-rosati-keW5&compute_aggregations=true
44
- mode = 'all'
45
- if not abstracts:
46
- mode = 'citations'
47
- search = f"https://api.scite.ai/search?mode={mode}&term={term}&limit={limit}&offset=0&user_slug=domenic-rosati-keW5&compute_aggregations=false"
48
- req = requests.get(
49
- search,
50
- headers={
51
- 'Authorization': f'Bearer {SCITE_API_KEY}'
52
- }
53
- )
54
- try:
55
- req.json()
56
- except:
57
- return [], []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- citation_contexts = [remove_html('\n'.join([cite['snippet'] for cite in doc['citations']])) for doc in req.json()['hits']]
60
  return (
61
- citation_contexts,
62
- [(doc['doi'], doc['citations'], doc['title'])
63
- for doc in req.json()['hits']]
64
  )
65
 
66
 
@@ -69,15 +90,28 @@ def find_source(text, docs):
69
  for snippet in doc[1]:
70
  if text in remove_html(snippet.get('snippet', '')):
71
  new_text = text
72
- for snip in remove_html(snippet.get('snippet', '')).split('.'):
73
- if text in snip:
74
- new_text = snip
75
  return {
76
  'citation_statement': snippet['snippet'].replace('<strong class="highlight">', '').replace('</strong>', ''),
77
  'text': new_text,
78
  'from': snippet['source'],
79
  'supporting': snippet['target'],
80
- 'source_title': doc[2],
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  'source_link': f"https://scite.ai/reports/{doc[0]}"
82
  }
83
  return None
@@ -159,9 +193,12 @@ st.markdown("""
159
  """, unsafe_allow_html=True)
160
 
161
  with st.expander("Settings (strictness, context limit, top hits)"):
162
- support_abstracts = st.radio(
163
- "Use abstracts as a ranking signal (if the words are matched in the abstract then the document is more relevant)?",
164
  ('yes', 'no'))
 
 
 
165
  strict_lenient_mix = st.radio(
166
  "Type of strict+lenient combination: Fallback or Mix? If fallback, strict is run first then if the results are less than context_lim we also search lenient. Mix will search them both and let reranking sort em out",
167
  ('fallback', 'mix'))
@@ -170,7 +207,7 @@ with st.expander("Settings (strictness, context limit, top hits)"):
170
  "Use Reranking? Reranking will rerank the top hits using semantic similarity of document and query.",
171
  ('yes', 'no'))
172
  top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 100)
173
- context_lim = st.slider('Context limit? How many documents to use for answering from. Larger is slower but higher quality', 10, 300, 10)
174
 
175
  # def paraphrase(text, max_length=128):
176
  # input_ids = queryexp_tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
@@ -190,9 +227,9 @@ def run_query(query):
190
  # could also try fallback if there are no good answers by score...
191
  limit = top_hits_limit or 100
192
  context_limit = context_lim or 10
193
- contexts_strict, orig_docs_strict = search(query, limit=limit, strict=True, abstracts=support_abstracts == 'yes')
194
  if strict_lenient_mix == 'fallback' and len(contexts_strict) < context_limit:
195
- contexts_lenient, orig_docs_lenient = search(query, limit=limit, strict=False, abstracts=support_abstracts == 'yes')
196
  contexts = list(
197
  set(contexts_strict + contexts_lenient)
198
  )
 
37
  # all search
38
 
39
 
40
+ def search(term, limit=10, clean=True, strict=True, all_mode=True, abstracts=True, abstract_only=False):
41
  term = clean_query(term, clean=clean, strict=strict)
42
  # heuristic, 2 searches strict and not? and then merge?
43
  # https://api.scite.ai/search?mode=all&term=unit%20testing%20software&limit=10&date_from=2000&date_to=2022&offset=0&supporting_from=1&contrasting_from=0&contrasting_to=0&user_slug=domenic-rosati-keW5&compute_aggregations=true
44
+ contexts, docs = [], []
45
+ if not abstract_only:
46
+ mode = 'all'
47
+ if not all_mode:
48
+ mode = 'citations'
49
+ search = f"https://api.scite.ai/search?mode={mode}&term={term}&limit={limit}&offset=0&user_slug=domenic-rosati-keW5&compute_aggregations=false"
50
+ req = requests.get(
51
+ search,
52
+ headers={
53
+ 'Authorization': f'Bearer {SCITE_API_KEY}'
54
+ }
55
+ )
56
+ try:
57
+ req.json()
58
+ except:
59
+ pass
60
+
61
+ contexts += [remove_html('\n'.join([cite['snippet'] for cite in doc['citations']])) for doc in req.json()['hits']]
62
+ docs += [(doc['doi'], doc['citations'], doc['title'], doc['abstract'] or '')
63
+ for doc in req.json()['hits']]
64
+
65
+ if abstracts or abstract_only:
66
+ search = f"https://api.scite.ai/search?mode=papers&abstract={term}&limit={limit}&offset=0&user_slug=domenic-rosati-keW5&compute_aggregations=false"
67
+ req = requests.get(
68
+ search,
69
+ headers={
70
+ 'Authorization': f'Bearer {SCITE_API_KEY}'
71
+ }
72
+ )
73
+ try:
74
+ req.json()
75
+ contexts += [remove_html(doc['abstract'] or '') for doc in req.json()['hits']]
76
+ docs += [(doc['doi'], doc['citations'], doc['title'], doc['abstract'] or '')
77
+ for doc in req.json()['hits']]
78
+ except:
79
+ pass
80
+
81
 
 
82
  return (
83
+ contexts,
84
+ docs
 
85
  )
86
 
87
 
 
90
  for snippet in doc[1]:
91
  if text in remove_html(snippet.get('snippet', '')):
92
  new_text = text
93
+ for sent in remove_html(snippet.get('snippet', '')).split('.'):
94
+ if text in sent:
95
+ new_text = sent
96
  return {
97
  'citation_statement': snippet['snippet'].replace('<strong class="highlight">', '').replace('</strong>', ''),
98
  'text': new_text,
99
  'from': snippet['source'],
100
  'supporting': snippet['target'],
101
+ 'source_title': remove_html(doc[2]),
102
+ 'source_link': f"https://scite.ai/reports/{doc[0]}"
103
+ }
104
+ if text in remove_html(doc[3]):
105
+ new_text = text
106
+ for sent in remove_html(doc[3]).split('.'):
107
+ if text in sent:
108
+ new_text = sent
109
+ return {
110
+ 'citation_statement': "ABSTRACT: " + remove_html(doc[3]).replace('<strong class="highlight">', '').replace('</strong>', ''),
111
+ 'text': new_text,
112
+ 'from': '...',
113
+ 'supporting': '...',
114
+ 'source_title': "ABSTRACT of " + remove_html(doc[2]),
115
  'source_link': f"https://scite.ai/reports/{doc[0]}"
116
  }
117
  return None
 
193
  """, unsafe_allow_html=True)
194
 
195
  with st.expander("Settings (strictness, context limit, top hits)"):
196
+ support_all = st.radio(
197
+ "Use abstracts and titles as a ranking signal (if the words are matched in the abstract then the document is more relevant)?",
198
  ('yes', 'no'))
199
+ support_abstracts = st.radio(
200
+ "Use abstracts as a source document?",
201
+ ('yes', 'no', 'abstract only'))
202
  strict_lenient_mix = st.radio(
203
  "Type of strict+lenient combination: Fallback or Mix? If fallback, strict is run first then if the results are less than context_lim we also search lenient. Mix will search them both and let reranking sort em out",
204
  ('fallback', 'mix'))
 
207
  "Use Reranking? Reranking will rerank the top hits using semantic similarity of document and query.",
208
  ('yes', 'no'))
209
  top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 100)
210
+ context_lim = st.slider('Context limit? How many documents to use for answering from. Larger is slower but higher quality', 10, 300, 25)
211
 
212
  # def paraphrase(text, max_length=128):
213
  # input_ids = queryexp_tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
 
227
  # could also try fallback if there are no good answers by score...
228
  limit = top_hits_limit or 100
229
  context_limit = context_lim or 10
230
+ contexts_strict, orig_docs_strict = search(query, limit=limit, strict=True, all_mode=support_all == 'yes', abstracts= support_abstracts == 'yes', abstract_only=support_abstracts == 'abstract only')
231
  if strict_lenient_mix == 'fallback' and len(contexts_strict) < context_limit:
232
+ contexts_lenient, orig_docs_lenient = search(query, limit=limit, strict=False, all_mode=support_all == 'yes', abstracts= support_abstracts == 'yes', abstract_only= support_abstracts == 'abstract only')
233
  contexts = list(
234
  set(contexts_strict + contexts_lenient)
235
  )