Spaces:

obss
/

question-answering-demo

Runtime error

App Files Files Community

secilozksen commited on Dec 16, 2022

Commit

bbe9860

1 Parent(s): 8c6cfa8

Upload 14 files

Browse files

streamlit updated

Files changed (9) hide show

DPR_pipeline.png +0 -0
README.md +17 -10
Retrieve-rerank-DPR.png +0 -0
Retrieve-rerank-trained-cross-encoder.png +0 -0
custom-dpr-context-embeddings.pkl +3 -0
demo_dpr.py +315 -0
environment.yml +31 -0
requirements.txt +7 -1
retrieve-rerank.png +0 -0

DPR_pipeline.png ADDED Viewed

README.md CHANGED Viewed

@@ -1,10 +1,17 @@
----
-title: QuestionAnsweringDemo
-emoji: ⚕
-colorFrom: gray
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.10.0
-app_file: demov2.py
-pinned: false
----

+# QuestionAnsweringDemo
+## Create the environment
+conda env create --file environment.yml
+conda activate QADemo
+After installing the requirements, make sure that you add your Hugging Face authorization token to your ./.streamlit/secrets.toml file.
+It should be something like:
+AUTH_TOKEN='your_auth_token_here'
+## Running the app:
+streamlit run demov2.py

Retrieve-rerank-DPR.png ADDED Viewed

Retrieve-rerank-trained-cross-encoder.png ADDED Viewed

custom-dpr-context-embeddings.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4370d7be3e126cfcab0d1cbffc11a44a0d7417a95a1201e35812974be5435955
+size 931607446

demo_dpr.py ADDED Viewed

	@@ -0,0 +1,315 @@

+import copy
+import streamlit as st
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+from sentence_transformers.cross_encoder import CrossEncoder
+from st_aggrid import GridOptionsBuilder, AgGrid
+import pickle
+import torch
+from transformers import DPRQuestionEncoderTokenizer, AutoModel
+from pathlib import Path
+import base64
+import regex
+import tokenizers
+# Page-wide Streamlit configuration; issued before any other Streamlit call in this script.
+st.set_page_config(layout="wide")
+# CSV files read by load_dataframes().
+DATAFRAME_FILE_ORIGINAL = 'policyQA_original.csv'
+DATAFRAME_FILE_BSBS = 'policyQA_bsbs_sentence.csv'
+# Maps each search-method label shown in the select box to the numeric id
+# that search_pipeline() dispatches on.
+selectbox_selections = {
+    'Retrieve - Rerank (with fine-tuned cross-encoder)': 1,
+    'Dense Passage Retrieval':2,
+    'Retrieve - Reranking with DPR':3,
+    'Retrieve - Rerank':4
+}
+# Maps each search-method label to the diagram image displayed for it.
+imagebox_selections = {
+    'Retrieve - Rerank (with fine-tuned cross-encoder)': 'Retrieve-rerank-trained-cross-encoder.png',
+    'Dense Passage Retrieval': 'DPR_pipeline.png',
+    'Retrieve - Reranking with DPR': 'Retrieve-rerank-DPR.png',
+    'Retrieve - Rerank': 'retrieve-rerank.png'
+}
+def retrieve_rerank(question):
+    # Semantic Search (Retrieve)
+    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)
+    hits = util.semantic_search(question_embedding, context_embeddings, top_k=100)
+    if len(hits) == 0:
+        return []
+    hits = hits[0]
+    # Rerank - score all retrieved passages with cross-encoder
+    cross_inp = [[question, contexes[hit['corpus_id']]] for hit in hits]
+    cross_scores = cross_encoder.predict(cross_inp)
+    # Sort results by the cross-encoder scores
+    for idx in range(len(cross_scores)):
+        hits[idx]['cross-score'] = cross_scores[idx]
+    # Output of top-5 hits from re-ranker
+    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+    top_5_contexes = []
+    top_5_scores = []
+    for hit in hits[0:20]:
+        top_5_contexes.append(contexes[hit['corpus_id']])
+        top_5_scores.append(hit['cross-score'])
+    return top_5_contexes, top_5_scores
+@st.cache(show_spinner=False, allow_output_mutation=True)
+def load_paragraphs(path):
+    with open(path, "rb") as fIn:
+        cache_data = pickle.load(fIn)
+        corpus_sentences = cache_data['contexes']
+        corpus_embeddings = cache_data['embeddings']
+    return corpus_embeddings, corpus_sentences
+@st.cache(show_spinner=False)
+def load_dataframes():
+    data_original = pd.read_csv(DATAFRAME_FILE_ORIGINAL, index_col=0, sep='|')
+    data_bsbs = pd.read_csv(DATAFRAME_FILE_BSBS, index_col=0, sep='|')
+    data_original = data_original.sample(frac=1).reset_index(drop=True)
+    data_bsbs = data_bsbs.sample(frac=1).reset_index(drop=True)
+    return data_original, data_bsbs
+def dot_product(question_output, context_output):
+    mat1 = torch.unsqueeze(question_output, dim=1)
+    mat2 = torch.unsqueeze(context_output, dim=2)
+    result = torch.bmm(mat1, mat2)
+    result = torch.squeeze(result, dim=1)
+    result = torch.squeeze(result, dim=1)
+    return result
+def retrieve_rerank_DPR(question):
+    hits = retrieve_with_dpr_embeddings(question)
+    return rerank_with_DPR(hits, question)
+def DPR_reranking(question, selected_contexes, selected_embeddings):
+    scores = []
+    tokenized_question = question_tokenizer(question, padding=True, truncation=True, return_tensors="pt",
+                                            add_special_tokens=True)
+    question_output = dpr_trained.model.question_model(**tokenized_question)
+    question_output = question_output['pooler_output']
+    for context_embedding in selected_embeddings:
+        score = dot_product(question_output, context_embedding)
+        scores.append(score.detach().cpu())
+    scores_index = sorted(range(len(scores)), key=lambda x: scores[x], reverse=True)
+    contexes_list = []
+    scores_final = []
+    for i, idx in enumerate(scores_index[:5]):
+        scores_final.append(scores[idx])
+        contexes_list.append(selected_contexes[idx])
+    return scores_final, contexes_list
+def search_pipeline(question, search_method):
+    if search_method == 1: #Retrieve - rerank with fine-tuned cross encoder
+        return retrieve_rerank_with_trained_cross_encoder(question)
+    if search_method == 2:
+        return custom_dpr_pipeline(question) # DPR only
+    if search_method == 3:
+        return retrieve_rerank_DPR(question)
+    if search_method == 4:
+        return retrieve_rerank(question)
+def custom_dpr_pipeline(question):
+    #paragraphs
+    tokenized_question = question_tokenizer(question, padding=True, truncation=True, return_tensors="pt",
+                                            add_special_tokens=True)
+    question_embedding = dpr_trained.model.question_model(**tokenized_question)
+    question_embedding = question_embedding['pooler_output']
+    results_list = []
+    for i,context_embedding in enumerate(dpr_context_embeddings):
+        score = dot_product(question_embedding, context_embedding)
+        results_list.append(score.detach().cpu().numpy()[0])
+    hits = sorted(range(len(results_list)), key=lambda i: results_list[i], reverse=True)
+    top_5_contexes = []
+    top_5_scores = []
+    for j in hits[0:5]:
+        top_5_contexes.append(dpr_contexes[j])
+        top_5_scores.append(results_list[j])
+    return top_5_contexes, top_5_scores
+def retrieve(question, corpus_embeddings):
+    # Semantic Search (Retrieve)
+    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)
+    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=100)
+    if len(hits) == 0:
+        return []
+    hits = hits[0]
+    return hits
+def retrieve_with_dpr_embeddings(question):
+    # Semantic Search (Retrieve)
+    question_tokens = question_tokenizer(question, padding=True, truncation=True, return_tensors="pt",
+                                            add_special_tokens=True)
+    question_embedding = dpr_trained.model.question_model(**question_tokens)['pooler_output']
+    question_embedding = torch.squeeze(question_embedding, dim=0)
+    corpus_embeddings = torch.stack(dpr_context_embeddings)
+    corpus_embeddings = torch.squeeze(corpus_embeddings, dim=1)
+    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=100)
+    if len(hits) == 0:
+        return []
+    hits = hits[0]
+    return hits
+def rerank_with_DPR(hits, question):
+    # Rerank - score all retrieved passages with cross-encoder
+    selected_contexes = [dpr_contexes[hit['corpus_id']] for hit in hits]
+    selected_embeddings = [dpr_context_embeddings[hit['corpus_id']] for hit in hits]
+    top_5_scores, top_5_contexes = DPR_reranking(question, selected_contexes, selected_embeddings)
+    return top_5_contexes, top_5_scores
+def DPR_reranking(question, selected_contexes, selected_embeddings):
+    scores = []
+    tokenized_question = question_tokenizer(question, padding=True, truncation=True, return_tensors="pt",
+                                            add_special_tokens=True)
+    question_output = dpr_trained.model.question_model(**tokenized_question)
+    question_output = question_output['pooler_output']
+    for context_embedding in selected_embeddings:
+        score = dot_product(question_output, context_embedding)
+        scores.append(score.detach().cpu().numpy()[0])
+    scores_index = sorted(range(len(scores)), key=lambda x: scores[x], reverse=True)
+    contexes_list = []
+    scores_final = []
+    for i, idx in enumerate(scores_index[:5]):
+        scores_final.append(scores[idx])
+        contexes_list.append(selected_contexes[idx])
+    return scores_final, contexes_list
+def retrieve_rerank_with_trained_cross_encoder(question):
+    hits = retrieve(question, context_embeddings)
+    cross_inp = [(question, contexes[hit['corpus_id']]) for hit in hits]
+    cross_scores = trained_cross_encoder.predict(cross_inp)
+    # Sort results by the cross-encoder scores
+    for idx in range(len(cross_scores)):
+        hits[idx]['cross-score'] = cross_scores[idx][0]
+    # Output of top-5 hits from re-ranker
+    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+    top_5_contexes = []
+    top_5_scores = []
+    for hit in hits[0:5]:
+        top_5_contexes.append(contexes[hit['corpus_id']])
+        top_5_scores.append(hit['cross-score'])
+    return top_5_contexes, top_5_scores
+def interactive_table(dataframe):
+    gb = GridOptionsBuilder.from_dataframe(dataframe)
+    gb.configure_pagination(paginationAutoPageSize=True)
+    gb.configure_side_bar()
+    gb.configure_selection('single', rowMultiSelectWithClick=True,
+                           groupSelectsChildren="Group checkbox select children")  # Enable multi-row selection
+    gridOptions = gb.build()
+    grid_response = AgGrid(
+        dataframe,
+        gridOptions=gridOptions,
+        data_return_mode='AS_INPUT',
+        update_mode='SELECTION_CHANGED',
+        enable_enterprise_modules=False,
+        fit_columns_on_grid_load=False,
+        theme='streamlit',  # Add theme color to the table
+        height=350,
+        width='100%',
+        reload_data=False
+    )
+    return grid_response
+def img_to_bytes(img_path):
+    img_bytes = Path(img_path).read_bytes()
+    encoded = base64.b64encode(img_bytes).decode()
+    return encoded
+def qa_main_widgetsv2():
+    """Render the demo page.
+
+    Draws the search-method selector with its diagram, a question form whose
+    results are listed with their scores, and two tables of sample questions
+    whose selected row is shown in detail.
+    """
+    st.title("Question Answering Demo")
+    st.markdown("""---""")
+    option = st.selectbox("Select a search method:", list(selectbox_selections.keys()))
+    # Embed the diagram of the chosen method as an inline base64 PNG
+    header_html = "<center> <img src='data:image/png;base64,{}' class='img-fluid' width='60%', height='40%'> </center>".format(
+        img_to_bytes(imagebox_selections[option])
+    )
+    st.markdown(
+        header_html, unsafe_allow_html=True,
+    )
+    st.markdown("""---""")
+    col1, col2, col3 = st.columns([2, 1, 1])
+    # Left column: question form and search results
+    with col1:
+        form = st.form(key='first_form')
+        question = form.text_area("What is your question?:", height=200)
+        submit = form.form_submit_button('Submit')
+        # Track the submit in session state
+        if "form_submit" not in st.session_state:
+            st.session_state.form_submit = False
+        if submit:
+            st.session_state.form_submit = True
+        if st.session_state.form_submit and question != '':
+            with st.spinner(text='Related context search in progress..'):
+                top_5_contexes, top_5_scores = search_pipeline(question.strip(), selectbox_selections[option])
+            if len(top_5_contexes) == 0:
+                st.error("Related context not found!")
+                st.session_state.form_submit = False
+            else:
+                # One section per returned context, numbered from 1
+                for i, context in enumerate(top_5_contexes):
+                    st.markdown(f"## Related Context - {i + 1} (score: {top_5_scores[i]:.2f})")
+                    st.markdown(context)
+                    st.markdown("""---""")
+    # Middle column: table built from dataframe_original
+    with col2:
+        st.markdown("## Original Questions")
+        grid_response = interactive_table(dataframe_original)
+        data1 = grid_response['selected_rows']
+        if "grid_click_1" not in st.session_state:
+            st.session_state.grid_click_1 = False
+        if len(data1) > 0:
+            st.session_state.grid_click_1 = True
+        if st.session_state.grid_click_1:
+            # Show the first selected row, then reset the flag
+            selection = data1[0]
+            #   st.markdown("## Context & Answer:")
+            st.markdown("### Context:")
+            st.write(selection['context'])
+            st.markdown("### Question:")
+            st.write(selection['question'])
+            st.markdown("### Answer:")
+            st.write(selection['answer'])
+            st.session_state.grid_click_1 = False
+    # Right column: table built from dataframe_bsbs
+    with col3:
+        st.markdown("## Our Questions")
+        grid_response = interactive_table(dataframe_bsbs)
+        data2 = grid_response['selected_rows']
+        if "grid_click_2" not in st.session_state:
+            st.session_state.grid_click_2 = False
+        if len(data2) > 0:
+            st.session_state.grid_click_2 = True
+        if st.session_state.grid_click_2:
+            # Show the first selected row, then reset the flag
+            selection = data2[0]
+            #   st.markdown("## Context & Answer:")
+            st.markdown("### Context:")
+            st.write(selection['context'])
+            st.markdown("### Question:")
+            st.write(selection['question'])
+            st.markdown("### Answer:")
+            st.write(selection['answer'])
+            st.session_state.grid_click_2 = False
+@st.cache(show_spinner=False, allow_output_mutation = True)
+def load_models(dpr_model_path, auth_token, cross_encoder_model_path):
+    dpr_trained = AutoModel.from_pretrained(dpr_model_path, use_auth_token=auth_token,
+                                            trust_remote_code=True)
+    bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
+    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+    bi_encoder.max_seq_length = 500
+    trained_cross_encoder = CrossEncoder(cross_encoder_model_path)
+    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+    return dpr_trained, bi_encoder, cross_encoder, trained_cross_encoder, question_tokenizer
+# Passages and embeddings searched by the bi-encoder pipelines.
+context_embeddings, contexes = load_paragraphs('context-embeddings.pkl')
+# Passages and embeddings searched by the DPR pipelines.
+dpr_context_embeddings, dpr_contexes = load_paragraphs('custom-dpr-context-embeddings.pkl')
+dataframe_original, dataframe_bsbs = load_dataframes()
+# Work on a deep copy of the objects returned by the cached loader; paths and token come from Streamlit secrets.
+dpr_trained, bi_encoder, cross_encoder, trained_cross_encoder, question_tokenizer = copy.deepcopy(load_models(st.secrets["DPR_MODEL_PATH"], st.secrets["AUTH_TOKEN"], st.secrets["CROSS_ENCODER_MODEL_PATH"]))
+# Build the page.
+qa_main_widgetsv2()

environment.yml ADDED Viewed

	@@ -0,0 +1,31 @@

+name: QADemo
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - cudatoolkit=11.6.0
+  - numpy-base=1.23.1
+  - pip=22.2.2
+  - python=3.10.6
+  - pytorch=1.12.1
+  - torchaudio=0.12.1
+  - torchvision=0.13.1
+  - pip:
+    - en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
+    - huggingface-hub==0.10.0
+    - nltk==3.7
+    - numpy==1.23.3
+    - pandas==1.5.0
+    - scikit-learn==1.1.2
+    - scipy==1.9.2
+    - sentence-transformers==2.2.2
+    - spacy==3.2.0
+    - sentencepiece==0.1.97
+    - streamlit==1.13.0
+    - streamlit-aggrid==0.3.3
+    - tokenizers==0.12.1
+    - toml==0.10.2
+    - toolz==0.12.0
+    - tqdm==4.64.1
+    - transformers==4.22.2

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 argon2-cffi==21.3.0
 argon2-cffi-bindings==21.2.0
 asttokens==2.0.5
@@ -20,6 +21,7 @@ cymem==2.0.7
 debugpy==1.6.0
 decorator==5.1.1
 defusedxml==0.7.1
 entrypoints==0.4
 executing==0.8.3
 fastjsonschema==2.15.3
@@ -47,7 +49,12 @@ jupyterlab-widgets==1.1.1
 kiwisolver==1.4.3
 langcodes==3.3.0
 MarkupSafe==2.1.1
 mistune==0.8.4
 mpmath==1.2.1
 murmurhash==1.0.9
 nbclient==0.6.4
@@ -106,7 +113,6 @@ six==1.16.0
 smart-open==5.2.1
 smmap==5.0.0
 soupsieve==2.3.2.post1
-en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
 spacy==3.2.0
 spacy-legacy==3.0.10
 spacy-loggers==1.0.3

+altair==4.2.0
 argon2-cffi==21.3.0
 argon2-cffi-bindings==21.2.0
 asttokens==2.0.5
 debugpy==1.6.0
 decorator==5.1.1
 defusedxml==0.7.1
+en-core-web-sm==3.2.0
 entrypoints==0.4
 executing==0.8.3
 fastjsonschema==2.15.3
 kiwisolver==1.4.3
 langcodes==3.3.0
 MarkupSafe==2.1.1
+matplotlib==3.5.2
+matplotlib-inline==0.1.3
 mistune==0.8.4
+mkl-fft==1.3.1
+mkl-random==1.2.2
+mkl-service==2.4.0
 mpmath==1.2.1
 murmurhash==1.0.9
 nbclient==0.6.4
 smart-open==5.2.1
 smmap==5.0.0
 soupsieve==2.3.2.post1
 spacy==3.2.0
 spacy-legacy==3.0.10
 spacy-loggers==1.0.3

retrieve-rerank.png ADDED Viewed