"""Streamlit app that compares two uploaded PDFs page by page with an LLM."""

import streamlit as st
from PyPDF2 import PdfReader
from anthropic import Anthropic

from prompts import DIFFERENTIATE_PROMPT

# NOTE(review): every delimiter literal in the parsing code had been reduced
# to an empty string.  ``str.split('')`` raises ValueError and ``'' in s`` is
# always True, so the parser could never succeed.  The tag names below are
# inferred from the result keys (text1 / text2 / explanation) and from the
# variable names in the old commented-out code -- confirm them against the
# output format that DIFFERENTIATE_PROMPT asks the model for.
_DIFFERENCES_TAG = "differences"
_DIFFERENCE_TAG = "difference"
# Maps each key of a parsed difference to the tag that carries its value.
_FIELD_TAGS = {"text1": "text1", "text2": "text2", "explanation": "explanation"}

# NOTE(review): this literal arrived broken across physical lines with only
# whitespace around the title, which suggests HTML wrapper tags were stripped.
# The surviving characters are kept verbatim; restore the markup if known.
_TITLE_MARKUP = '\n' + '\n\nPDF Upload and Compare App\n\n' + '\n'

# Number of additional pages compared per click of "Find Differences".
_PAGES_PER_CLICK = 5


def _tag_body(text, tag):
    """Return the text between the first ``<tag>`` and the next ``</tag>``.

    Returns None when the opening tag is absent or no closing tag follows it.
    """
    _, opener, rest = text.partition(f"<{tag}>")
    if not opener:
        return None
    body, closer, _ = rest.partition(f"</{tag}>")
    return body if closer else None


def extract_differences(input_text):
    """Parse the model's tagged answer into a list of difference dicts.

    Args:
        input_text: Text containing zero or more difference entries, each
            holding a text1, text2 and explanation field.

    Returns:
        A list of ``{'text1', 'text2', 'explanation'}`` dicts with
        whitespace-stripped values.  Chunks missing any field are skipped,
        which also discards the trailing remainder after the last entry.
    """
    parsed_data = []
    for chunk in input_text.strip().split(f"</{_DIFFERENCE_TAG}>"):
        fields = {key: _tag_body(chunk, tag) for key, tag in _FIELD_TAGS.items()}
        if any(value is None for value in fields.values()):
            continue
        parsed_data.append({key: value.strip() for key, value in fields.items()})
    return parsed_data


def make_llm_api_call(prompt):
    """Send ``prompt`` as a single user message and return the API response.

    Uses model ``claude-3-haiku-20240307`` with temperature 0 and at most
    4096 output tokens.
    """
    client = Anthropic()
    message = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=4096,
        temperature=0,
        messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
    )
    return message


def get_llm_response(extractedtext1, extractedtext2):
    """Ask the model for the differences between two page texts.

    Args:
        extractedtext1: Page text from the first PDF.
        extractedtext2: Page text from the second PDF.

    Returns:
        The list produced by ``extract_differences``.  An empty list is
        returned when parsing fails; the previous version returned a
        ``(message_text, [])`` tuple in that case, which callers iterating
        over dicts could not handle.
    """
    prompt = DIFFERENTIATE_PROMPT.format(text1=extractedtext1, text2=extractedtext2)
    message = make_llm_api_call(prompt)
    message_text = message.content[0].text

    # Prefer the wrapped section; fall back to the whole reply without it.
    wrapped = _tag_body(message_text, _DIFFERENCES_TAG)
    payload = message_text if wrapped is None else wrapped.strip()
    try:
        return extract_differences(payload)
    except Exception as e:  # best effort: one bad reply must not stop the app
        print("Error:", e)
        return []


def extract_text_with_pypdf(pdf_path):
    """Return a list with the extracted text of each page, in page order.

    Args:
        pdf_path: Anything ``PdfReader`` accepts; ``main`` passes the object
            returned by ``st.file_uploader``.
    """
    reader = PdfReader(pdf_path)
    # A falsy extraction result becomes "" so later str.join calls cannot fail.
    return [page.extract_text() or "" for page in reader.pages]


def _init_session_state():
    """Create the session-state entries the app relies on, if missing."""
    defaults = {
        "differences_data": [],
        "display_data": {"file1": None, "file2": None, "i": 0},
        "file1": None,
        "file2": None,
        "extracted_texts": {
            "file1": None,
            "file2": None,
            "extracted_text_1": [],
            "extracted_text_2": [],
        },
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _render_extracted(pages):
    """Join page texts with a dashed separator, one paragraph per line."""
    return "\n\n".join("\n\n------------------------\n\n".join(pages).splitlines())


def _highlight(snippet):
    """Return ``snippet`` prepared for insertion into the rendered page text.

    Each line is formatted on its own and the lines are re-joined with a
    newline.  NOTE(review): the per-line template is just the line itself, so
    nothing is visually marked; wrapper markup appears to have been stripped
    from this file -- restore it here if known.
    """
    return "\n".join(f"{line}" for line in snippet.splitlines())


def _compare_pages(etext1, etext2):
    """Query the model for one page pair and annotate both page texts.

    Returns:
        ``(pdata, dext1, dext2)``: the parsed differences and the two page
        texts with every differing snippet passed through ``_highlight``.
    """
    pdata = get_llm_response(etext1, etext2)
    dext1, dext2 = etext1, etext2
    for diff in pdata:
        diff_text1 = diff["text1"].strip()
        diff_text2 = diff["text2"].strip()
        if diff_text1 == "" or diff_text2 == "":
            continue
        dext1 = dext1.replace(diff_text1, _highlight(diff_text1))
        dext2 = dext2.replace(diff_text2, _highlight(diff_text2))
    return pdata, dext1, dext2


def _load_extracted_texts():
    """Extract text from the uploaded PDFs when the uploads have changed.

    Extraction runs before any session state is rewritten, so a failure
    cannot leave stale text recorded against the new files.

    Returns:
        ``(pages1, pages2)`` read back from session state.
    """
    file1 = st.session_state.file1
    file2 = st.session_state.file2
    shown = st.session_state.display_data
    if shown["file1"] != file1 or shown["file2"] != file2:
        pages1 = extract_text_with_pypdf(file1)
        pages2 = extract_text_with_pypdf(file2)
        # New uploads restart the page counter.
        st.session_state.display_data = {"file1": file1, "file2": file2, "i": 0}
        st.session_state.extracted_texts = {
            "file1": file1,
            "file2": file2,
            "extracted_text_1": pages1,
            "extracted_text_2": pages2,
        }
    texts = st.session_state.extracted_texts
    return texts["extracted_text_1"], texts["extracted_text_2"]


def _show_page_differences(i, etext1, etext2, filename1, filename2):
    """Render the comparison for one page pair, using cached results if any."""
    cached = next(
        (
            d
            for d in st.session_state.differences_data
            if d['etext1'] == etext1 and d['etext2'] == etext2
        ),
        None,
    )
    if cached is not None:
        pdata = cached['pdata']
        dext1 = cached['dext1']
        dext2 = cached['dext2']
    else:
        pdata, dext1, dext2 = _compare_pages(etext1, etext2)
        st.session_state.differences_data.append(
            {"etext1": etext1, "etext2": etext2, "pdata": pdata, "dext1": dext1, "dext2": dext2}
        )
        # Also cache the mirrored entry so swapping the uploads is a cache hit.
        reverse_pdata = [
            {'text1': d['text2'], 'text2': d['text1'], 'explanation': d['explanation']}
            for d in pdata
        ]
        st.session_state.differences_data.append(
            {"etext1": etext2, "etext2": etext1, "pdata": reverse_pdata, "dext1": dext2, "dext2": dext1}
        )

    display_text = "\n\n\n".join(
        [
            f"**PDF 1:**\n\n{d['text1']}\n\n**PDF 2:**\n\n{d['text2']}\n\n**Explanation:**\n\n{d['explanation']}\n\n----------------------\n"
            for d in pdata
        ]
    )
    with st.expander(f"**Page {i+1}** - {filename1}"):
        st.markdown("\n\n".join(dext1.splitlines()), unsafe_allow_html=True)
    with st.expander(f"**Page {i+1}** - {filename2}"):
        st.markdown("\n\n".join(dext2.splitlines()), unsafe_allow_html=True)
    st.markdown(display_text)


def main():
    """Run the app: upload two PDFs, show their text, compare page pairs.

    Pages are compared in lockstep, so pages beyond the shorter document are
    not compared.  Each click of "Find Differences" raises the number of page
    pairs processed on the next rerun.
    """
    st.set_page_config(layout="wide")  # Enable wide layout
    _init_session_state()

    st.markdown(_TITLE_MARKUP, unsafe_allow_html=True)

    # Side-by-side upload widgets.
    col1, col2 = st.columns([2, 2])
    st.session_state.file1 = col1.file_uploader("**PDF 1**", type="pdf")
    st.session_state.file2 = col2.file_uploader("**PDF 2**", type="pdf")

    if not (st.session_state.file1 and st.session_state.file2):
        return

    filename1 = st.session_state.file1.name
    filename2 = st.session_state.file2.name

    with st.spinner("Extracting text from PDFs"):
        try:
            extracted_text1, extracted_text2 = _load_extracted_texts()
            with col1.expander(filename1):
                st.write(_render_extracted(extracted_text1))
            with col2.expander(filename2):
                st.write(_render_extracted(extracted_text2))
            st.success("PDF text extraction complete")
        except Exception as e:
            # Nothing is saved here; the message names what actually failed.
            st.error(f"Error extracting text from PDFs: {str(e)}")
            # Leave the comparison loop with nothing to do.
            extracted_text1, extracted_text2 = [], []

    with st.spinner("Processing Pages within the PDFS"):
        try:
            page_limit = st.session_state.display_data["i"]
            for i, (etext1, etext2) in enumerate(zip(extracted_text1, extracted_text2)):
                if i >= page_limit:
                    break
                _show_page_differences(i, etext1, etext2, filename1, filename2)
        except Exception as e:
            st.error(f"Error finding differences: {str(e)}")

    # Button at the bottom reveals the next batch of pages.
    if st.button("Find Differences"):
        st.session_state.display_data["i"] = st.session_state.display_data["i"] + _PAGES_PER_CLICK
        st.rerun()


if __name__ == "__main__":
    main()