import streamlit as st
from PyPDF2 import PdfReader
from anthropic import Anthropic
from prompts import DIFFERENTIATE_PROMPT
def extract_differences(input_text):
    """Parse tagged difference records out of an LLM reply.

    Each record is expected to look like::

        <difference>
            <text1>...</text1>
            <text2>...</text2>
            <explanation>...</explanation>
        </difference>

    The text is split on the record's closing tag; a chunk that does not
    contain a complete open/close pair for all three fields (for example
    whatever trails the last record) is skipped.

    NOTE(review): the previous version used empty strings for every
    delimiter, which makes ``str.split`` raise ``ValueError`` on any input.
    The tag names below are inferred from the keys of the returned
    dictionaries -- confirm they match the format requested by
    DIFFERENTIATE_PROMPT.

    Args:
        input_text: Raw text containing zero or more tagged records.

    Returns:
        A list of dicts with the keys ``'text1'``, ``'text2'`` and
        ``'explanation'`` (values stripped of surrounding whitespace), in
        the order the records appear.
    """
    record_close = "</difference>"
    field_names = ("text1", "text2", "explanation")

    parsed_data = []
    for record in input_text.strip().split(record_close):
        fields = {}
        for name in field_names:
            _, opened, rest = record.partition(f"<{name}>")
            value, closed, _ = rest.partition(f"</{name}>")
            if not (opened and closed):
                # Incomplete record: drop it rather than emit partial data.
                break
            fields[name] = value.strip()
        else:
            # Only reached when every field was found.
            parsed_data.append(fields)
    return parsed_data
def make_llm_api_call(prompt):
    """Send ``prompt`` as a single user message and return the API reply.

    A fresh ``Anthropic`` client is constructed (with no arguments) on every
    call. The object returned by ``client.messages.create`` is handed back
    unchanged; callers read the text from its ``content`` list.
    """
    user_turn = {"role": "user", "content": [{"type": "text", "text": prompt}]}
    request = {
        "model": "claude-3-haiku-20240307",
        "max_tokens": 4096,
        "temperature": 0,
        "messages": [user_turn],
    }
    return Anthropic().messages.create(**request)
def get_llm_response(extractedtext1, extractedtext2):
    """Ask the LLM to compare two texts and return the parsed differences.

    The prompt is built from ``DIFFERENTIATE_PROMPT``, sent through
    ``make_llm_api_call``, and the text of the first content block of the
    reply is parsed with ``extract_differences``. Parsing is attempted on
    the section enclosed by the wrapper tags first and, if that section is
    absent or parsing it raises, on the whole reply.

    NOTE(review): the wrapper delimiters were empty strings before (so the
    first attempt always raised); ``<differences>`` / ``</differences>`` is
    an assumption -- confirm against DIFFERENTIATE_PROMPT.

    Args:
        extractedtext1: Text substituted for ``text1`` in the prompt.
        extractedtext2: Text substituted for ``text2`` in the prompt.

    Returns:
        The list produced by ``extract_differences``. An empty list is
        returned when every parsing attempt raises; previously that path
        returned a ``(message_text, [])`` tuple, which callers iterating the
        result as a list of dicts could not handle.
    """
    prompt = DIFFERENTIATE_PROMPT.format(text1=extractedtext1, text2=extractedtext2)
    message = make_llm_api_call(prompt)
    message_text = message.content[0].text

    wrapper_open, wrapper_close = "<differences>", "</differences>"
    candidates = []
    _, has_wrapper, after_open = message_text.partition(wrapper_open)
    if has_wrapper:
        candidates.append(after_open.partition(wrapper_close)[0].strip())
    # Fall back to the full reply when the wrapper is missing or unparsable.
    candidates.append(message_text)

    last_error = None
    for candidate in candidates:
        try:
            return extract_differences(candidate)
        except Exception as e:  # best-effort parsing of free-form model output
            last_error = e
    print("Error:", last_error)
    return []
def extract_text_with_pypdf(pdf_path):
    """Return the extracted text of every page of a PDF, in page order.

    ``pdf_path`` is passed straight to ``PdfReader``. The result is a list
    with one entry per page, each being whatever ``page.extract_text()``
    returned for that page.
    """
    return [page.extract_text() for page in PdfReader(pdf_path).pages]
def _init_session_state():
    """Create the session-state entries used by main() if they are absent."""
    defaults = {
        "differences_data": [],
        "display_data": {"file1": None, "file2": None, "i": 0},
        "file1": None,
        "file2": None,
        "extracted_texts": {
            "file1": None,
            "file2": None,
            "extracted_text_1": [],
            "extracted_text_2": [],
        },
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _paragraphs(text):
    """Turn every line break into a blank line so markdown keeps the lines apart."""
    return "\n\n".join(text.splitlines())


def _mark_snippet(text, snippet):
    """Surround each line of every occurrence of ``snippet`` with placeholder marks."""
    if snippet not in text:
        return text
    # Each line is wrapped on its own so the markup stays balanced after the
    # text is later split into one paragraph per line.
    marked = "\n".join(f"\x00{line}\x01" for line in snippet.splitlines())
    return text.replace(snippet, marked)


def _render_marked(text):
    """HTML-escape ``text`` and turn the placeholder marks into highlight tags."""
    from html import escape

    # Escaping happens before the tags are inserted, so page content cannot
    # inject markup even though it is rendered with unsafe_allow_html=True.
    # NOTE(review): the original highlight markup was lost (the wrapper
    # f-string was an identity); <mark> is an assumption -- confirm styling.
    escaped = escape(text, quote=False)
    return escaped.replace("\x00", "<mark>").replace("\x01", "</mark>")


def _highlight_differences(text1, text2, differences):
    """Return both page texts as HTML with the reported differing snippets highlighted."""
    # Drop any stray placeholder characters already present in the page text.
    marked1 = text1.replace("\x00", "").replace("\x01", "")
    marked2 = text2.replace("\x00", "").replace("\x01", "")
    for diff in differences:
        snippet1 = diff['text1'].strip()
        snippet2 = diff['text2'].strip()
        # A difference is only highlighted when both sides are non-empty.
        if not snippet1 or not snippet2:
            continue
        marked1 = _mark_snippet(marked1, snippet1)
        marked2 = _mark_snippet(marked2, snippet2)
    return _render_marked(marked1), _render_marked(marked2)


def main():
    """Streamlit entry point: upload two PDFs and show per-page differences.

    Flow, as implemented below:

    1. Two PDFs are uploaded side by side; their per-page text is extracted
       once per pair of uploads and kept in ``st.session_state``.
    2. Pages are compared pairwise (``zip``, so only up to the shorter
       document) through ``get_llm_response``. Results are kept in
       ``st.session_state.differences_data`` keyed by the page texts, in
       both orders, so reruns and swapped uploads do not call the LLM again.
    3. Only the first ``display_data["i"]`` page pairs are processed; each
       click on "Find Differences" raises that limit by 5 and reruns.
    """
    st.set_page_config(layout="wide")  # Enable wide layout
    _init_session_state()
    state = st.session_state

    # NOTE(review): the original title literal was unterminated and its HTML
    # was lost; a centred <h1> is an assumption -- confirm the intended markup.
    st.markdown(
        '<div style="text-align: center;">'
        + '<h1>PDF Upload and Compare App</h1>'
        + '</div>',
        unsafe_allow_html=True,
    )

    # Side-by-side upload widgets.
    col1, col2 = st.columns([2, 2])
    state.file1 = col1.file_uploader("**PDF 1**", type="pdf")
    state.file2 = col2.file_uploader("**PDF 2**", type="pdf")

    # Nothing to compare until both files are present.
    if not (state.file1 and state.file2):
        return

    filename1 = state.file1.name
    filename2 = state.file2.name

    with st.spinner("Extracting text from PDFs"):
        try:
            shown = state.display_data
            if shown["file1"] != state.file1 or shown["file2"] != state.file2:
                # Extract first and commit the state afterwards, so a failed
                # extraction is retried on the next run instead of leaving
                # stale text associated with the new uploads.
                pages1 = extract_text_with_pypdf(state.file1)
                pages2 = extract_text_with_pypdf(state.file2)
                state.extracted_texts = {
                    "file1": state.file1,
                    "file2": state.file2,
                    "extracted_text_1": pages1,
                    "extracted_text_2": pages2,
                }
                state.display_data = {"file1": state.file1, "file2": state.file2, "i": 0}
            extracted_text1 = state.extracted_texts["extracted_text_1"]
            extracted_text2 = state.extracted_texts["extracted_text_2"]
            page_separator = "\n\n------------------------\n\n"
            with col1.expander(filename1):
                st.write(_paragraphs(page_separator.join(extracted_text1)))
            with col2.expander(filename2):
                st.write(_paragraphs(page_separator.join(extracted_text2)))
            st.success("PDF text extraction complete")
        except Exception as e:
            st.error(f"Error extracting text from PDFs: {str(e)}")
            # Without page text there is nothing to compare.
            return

    with st.spinner("Processing Pages within the PDFS"):
        try:
            page_limit = state.display_data["i"]
            for i, (etext1, etext2) in enumerate(zip(extracted_text1, extracted_text2)):
                if i >= page_limit:
                    break
                cached = next(
                    (
                        d for d in state.differences_data
                        if d['etext1'] == etext1 and d['etext2'] == etext2
                    ),
                    None,
                )
                if cached:
                    pdata = cached['pdata']
                    dext1 = cached['dext1']
                    dext2 = cached['dext2']
                else:
                    pdata = get_llm_response(etext1, etext2)
                    dext1, dext2 = _highlight_differences(etext1, etext2, pdata)
                    state.differences_data.append(
                        {"etext1": etext1, "etext2": etext2, "pdata": pdata, "dext1": dext1, "dext2": dext2}
                    )
                    # Also store the mirrored result so the same pages
                    # uploaded in the opposite order hit the cache.
                    reverse_pdata = [
                        {'text1': d['text2'], 'text2': d['text1'], 'explanation': d['explanation']}
                        for d in pdata
                    ]
                    state.differences_data.append(
                        {"etext1": etext2, "etext2": etext1, "pdata": reverse_pdata, "dext1": dext2, "dext2": dext1}
                    )
                display_text = "\n\n\n".join([f"**PDF 1:**\n\n{d['text1']}\n\n**PDF 2:**\n\n{d['text2']}\n\n**Explanation:**\n\n{d['explanation']}\n\n----------------------\n" for d in pdata])
                with st.expander(f"**Page {i+1}** - {filename1}"):
                    st.markdown(_paragraphs(dext1), unsafe_allow_html=True)
                with st.expander(f"**Page {i+1}** - {filename2}"):
                    st.markdown(_paragraphs(dext2), unsafe_allow_html=True)
                st.markdown(display_text)
        except Exception as e:
            st.error(f"Error finding differences: {str(e)}")

    # Each click reveals (and, if needed, computes) five more page pairs.
    if st.button("Find Differences"):
        state.display_data["i"] = state.display_data["i"] + 5
        st.rerun()
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()