domenicrosati committed
Commit b7e15be · 1 Parent(s): a14da38

improve tokenization

Files changed (1)
  1. app.py +32 -13
app.py CHANGED
@@ -90,7 +90,7 @@ def find_source(text, docs):
         for snippet in doc[1]:
             if text in remove_html(snippet.get('snippet', '')):
                 new_text = text
-                for sent in remove_html(snippet.get('snippet', '')).split('.'):
+                for sent in nltk.sent_tokenize(remove_html(snippet.get('snippet', ''))):
                     if text in sent:
                         new_text = sent
                 return {
@@ -103,7 +103,7 @@ def find_source(text, docs):
                 }
         if text in remove_html(doc[3]):
             new_text = text
-            for sent in remove_html(doc[3]).split('.'):
+            for sent in nltk.sent_tokenize(remove_html(doc[3])):
                 if text in sent:
                     new_text = sent
             return {
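Why these two hunks matter: str.split('.') breaks text at every period, including periods inside tokens like the chromosome region "4p16.3" mentioned in this commit's own TODO comments, while NLTK's Punkt tokenizer splits only at real sentence boundaries. A minimal sketch of the difference, assuming the punkt model has been downloaded (find_source now also needs nltk imported at runtime):

    import nltk

    nltk.download('punkt')  # one-time download of the Punkt sentence model

    text = ("Deletions involving chromosome region 4p16.3 cause Wolf-Hirschhorn "
            "syndrome (WHS, OMIM 194190) [Battaglia et al, 2001]. Risk factors vary.")

    print(text.split('.')[0])
    # 'Deletions involving chromosome region 4p16' -- split mid-token

    print(nltk.sent_tokenize(text)[0])
    # the full first sentence, with '4p16.3' left intact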
@@ -206,8 +206,8 @@ with st.expander("Settings (strictness, context limit, top hits)"):
     use_reranking = st.radio(
         "Use Reranking? Reranking will rerank the top hits using semantic similarity of document and query.",
         ('yes', 'no'))
-    top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 100)
-    context_lim = st.slider('Context limit? How many documents to use for answering from. Larger is slower but higher quality', 10, 300, 25)
+    top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 10)
+    context_lim = st.slider('Context limit? How many documents to use for answering from. Larger is slower but higher quality', 10, 300, 5)
 
 # def paraphrase(text, max_length=128):
 #     input_ids = queryexp_tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
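Note that this hunk only lowers the defaults, not the ranges: in Streamlit's positional form st.slider(label, min_value, max_value, value), the fourth argument is the starting value, so the app now opens with 10 reranking documents and 5 answering contexts instead of 100 and 25, trading recall for speed on first load:

    # st.slider(label, min_value, max_value, value) -- only the default changes
    top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 10)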
@@ -216,6 +216,22 @@ with st.expander("Settings (strictness, context limit, top hits)"):
 #     preds = '\n * '.join(queries)
 #     return preds
 
+
+def group_results_by_context(results):
+    result_groups = {}
+    for result in results:
+        if result['context'] not in result_groups:
+            result_groups[result['context']] = result
+            result_groups[result['context']]['texts'] = []
+
+        result_groups[result['context']]['texts'].append(
+            result['answer']
+        )
+        if result['score'] > result_groups[result['context']]['score']:
+            result_groups[result['context']]['score'] = result['score']
+    return list(result_groups.values())
+
+
 def run_query(query):
     # if use_query_exp == 'yes':
     #     query_exp = paraphrase(f"question2question: {query}")
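The new helper collapses QA hits that share a snippet into one card: the first result for a context becomes the group's row, every answer string is accumulated in texts, and the group keeps the maximum score seen. A quick illustration on hypothetical input:

    results = [
        {'context': 'ctx A', 'answer': 'foo', 'score': 0.2},
        {'context': 'ctx A', 'answer': 'bar', 'score': 0.9},
        {'context': 'ctx B', 'answer': 'baz', 'score': 0.5},
    ]
    grouped = group_results_by_context(results)
    # -> 'ctx A' group with texts == ['foo', 'bar'] and score == 0.9,
    #    'ctx B' group with texts == ['baz'] and score == 0.5

One design note: the group reuses and mutates the first result dict per context, so that dict gains a texts list and may have its score raised in place.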
@@ -224,6 +240,10 @@ def run_query(query):
     #     * {query_exp}
     #     """)
 
+    # address period in highlight: avoidability. Risk factors
+    # address poor tokenization: Deletions involving chromosome region 4p16.3 cause Wolf-Hirschhorn syndrome (WHS, OMIM 194190) [Battaglia et al, 2001].
+    # address highlight html
+
     # could also try fallback if there are no good answers by score...
     limit = top_hits_limit or 100
     context_limit = context_lim or 10
@@ -280,12 +300,9 @@ def run_query(query):
             "score": result['score'],
             "doi": support["supporting"]
         })
-    sorted_result = sorted(results, key=lambda x: x['score'])
-    sorted_result = list({
-        result['context']: result for result in sorted_result
-    }.values())
-    sorted_result = sorted(
-        sorted_result, key=lambda x: x['score'], reverse=True)
+
+    grouped_results = group_results_by_context(results)
+    sorted_result = sorted(grouped_results, key=lambda x: x['score'], reverse=True)
 
     if confidence_threshold == 0:
         threshold = 0
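The deleted block deduplicated by context with a dict-comprehension trick: sorting ascending first meant later (higher-scored) entries overwrote earlier ones under the same context key, keeping only the single best hit per snippet. The grouping helper keeps all answers per snippet instead. The old idiom, condensed for reference:

    # ascending sort + dict overwrite == keep the highest-scored hit per context
    best_per_context = {r['context']: r for r in sorted(results, key=lambda x: x['score'])}
    sorted_result = sorted(best_per_context.values(), key=lambda x: x['score'], reverse=True)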
@@ -299,9 +316,11 @@ def run_query(query):
 
     for r in sorted_result:
         answer = r["answer"]
-        ctx = remove_html(r["context"]).replace(answer, f"<mark>{answer}</mark>").replace(
-            '<cite', '<a').replace('</cite', '</a').replace('data-doi="', 'href="https://scite.ai/reports/')
-        title = r.get("title", '').replace("_", " ")
+        ctx = remove_html(r["context"])
+        for answer in r['texts']:
+            ctx = ctx.replace(answer, f"<mark>{answer}</mark>")
+        # .replace( '<cite', '<a').replace('</cite', '</a').replace('data-doi="', 'href="https://scite.ai/reports/')
+        title = r.get("title", '')
         score = round(r["score"], 4)
         card(title, ctx, score, r['link'], r['doi'])
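With the grouped texts list, each card now highlights every answer extracted from its snippet rather than only one; the <cite> → <a> rewriting is parked in a comment pending the "address highlight html" TODO above. A toy run of the new loop:

    ctx = "Region 4p16.3 deletions cause Wolf-Hirschhorn syndrome."
    for answer in ["4p16.3", "Wolf-Hirschhorn syndrome"]:
        ctx = ctx.replace(answer, f"<mark>{answer}</mark>")
    # ctx == 'Region <mark>4p16.3</mark> deletions cause <mark>Wolf-Hirschhorn syndrome</mark>.'

(Note the loop variable reuses the name answer, shadowing the answer = r["answer"] assignment above it.)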
 
 