File size: 7,384 Bytes
11873da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import fitz
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from anthropic import Anthropic
from prompts import INFORMATION_EXTRACTION_PROMPT, INFORMATION_EXTRACTION_TAG_FORMAT, verify_INFORMATION_EXTRACTION_PROMPT, extract_INFORMATION_EXTRACTION_PROMPT
from prompts import verify_all_tags_present
from prompts import COMPARISON_INPUT_FORMAT, COMPARISON_PROMPT, COMPARISON_TAG_FORMAT, verify_COMPARISON_PROMPT, extract_COMPARISON_PROMPT
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import streamlit as st
from dotenv import load_dotenv
load_dotenv()

def make_llm_api_call(messages):
    """Send *messages* to the Claude Haiku model and return the raw response.

    Args:
        messages: Conversation history as a list of Anthropic-format
            message dicts (``{"role": ..., "content": [...]}``).

    Returns:
        The ``Message`` object returned by the Anthropic API.
    """
    print("Making LLM api call")
    # A fresh client per call keeps this helper stateless.
    client = Anthropic()
    response = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=4096,
        temperature=0,  # deterministic output for format-sensitive parsing
        messages=messages,
    )
    print("LLM response received")
    return response

def loop_verify_format(answer_text, tag_format, messages, verify_func, root_tag):
    """Retry until the LLM answer passes *verify_func*, with a bounded retry cap.

    Args:
        answer_text: Candidate answer text extracted from the LLM response.
        tag_format: Description of the expected tag format, shown to the
            model in the corrective prompt.
        messages: Running conversation history; corrective turns are appended
            in place so context accumulates across retries.
        verify_func: Predicate returning True when *answer_text* is well-formed.
        root_tag: Name of the outermost XML-style tag wrapping the answer.

    Returns:
        A validated answer string wrapped in ``<root_tag>...</root_tag>``.

    Raises:
        Exception: If the model still produces a malformed answer after the
            retry limit.
    """
    # Bug fix: the original initialized ``i = 0`` but never incremented it,
    # so the ``i > 3`` guard was unreachable and a persistently malformed
    # answer looped forever against the API.
    max_attempts = 4
    attempts = 0
    while not verify_func(answer_text):
        if attempts >= max_attempts:
            raise Exception(f"LLM failed to provide a valid answer in {attempts} attempts")
        attempts += 1
        print("Wrong format")
        assistant_message = {"role": "assistant", "content": [{"type": "text", "text": answer_text}]}
        corrective_message = {"role": "user", "content": [{"type": "text", "text": f"You did not provide your answer in the correct format. Please provide your answer in the following format:\n{tag_format}"}]}
        messages.append(assistant_message)
        messages.append(corrective_message)
        message = make_llm_api_call(messages)
        message_text = message.content[0].text
        # Re-extract only the payload between the root tags and re-wrap it.
        answer_text = f"<{root_tag}>\n{message_text.split(f'<{root_tag}>')[1].split(f'</{root_tag}>')[0].strip()}\n</{root_tag}>"
    return answer_text

def loop_verify_all_tags_present(answer_text, tags, user_message, tag_format, verify_func, root_tag):
    """Ensure every requested tag appears in *answer_text*, re-querying once if not.

    Args:
        answer_text: Candidate answer text to check.
        tags: Iterable of tag names that must all appear in the answer.
        user_message: The original user message, replayed as context for the
            corrective API call.
        tag_format: Expected tag format, forwarded to ``loop_verify_format``.
        verify_func: Format-verification predicate, forwarded to
            ``loop_verify_format``.
        root_tag: Name of the outermost XML-style tag wrapping the answer.

    Returns:
        The (possibly regenerated) answer text. Note: only one corrective
        round trip is made; if tags are still missing afterwards the text is
        returned as-is.
    """
    missing_tags, _ = verify_all_tags_present(answer_text, tags)
    if missing_tags:
        print("There are missing tags", missing_tags)
        assistant_message = {"role": "assistant", "content": [{"type": "text", "text": answer_text}]}
        # Bug fix: the original wrapped this message dict in an extra list,
        # so ``messages`` contained a nested list instead of a message dict —
        # an invalid payload for the Anthropic messages API.
        corrective_message = {"role": "user", "content": [{"type": "text", "text": ("In your response, the following tags are missing:\n" + "\n".join([f"<tag>{tag}</tag>" for tag in missing_tags]) + "\n\nPlease add information about the above missing tags and give a complete correct response.")}]}
        messages = [user_message, assistant_message, corrective_message]
        message = make_llm_api_call(messages)
        message_text = message.content[0].text
        answer_text = f"<{root_tag}>\n{message_text.split(f'<{root_tag}>')[1].split(f'</{root_tag}>')[0].strip()}\n</{root_tag}>"
        answer_text = loop_verify_format(answer_text, tag_format, [user_message], verify_func, root_tag)
        missing_tags, _ = verify_all_tags_present(answer_text, tags)
        if missing_tags:
            # Best-effort: surface the problem instead of silently ignoring it.
            print("Tags still missing after retry", missing_tags)
    return answer_text

def extract_information_from_pdf(pdf_text, tags):
    """Run the information-extraction prompt over *pdf_text* for the given tags.

    Args:
        pdf_text: A chunk of text extracted from a PDF.
        tags: Iterable of tag names to extract information for.

    Returns:
        Whatever ``extract_INFORMATION_EXTRACTION_PROMPT`` parses out of the
        validated ``<answer>...</answer>`` response (a per-tag mapping).
    """
    tag_text = "\n".join([f"<tag>{tag}</tag>" for tag in tags])
    prompt = INFORMATION_EXTRACTION_PROMPT.format(TEXT=pdf_text, TAGS=tag_text)
    user_message = {"role": "user", "content": [{"type": "text", "text": prompt}]}
    messages = [user_message]
    message = make_llm_api_call(messages)
    message_text = message.content[0].text
    answer_text = f"<answer>\n{message_text.split('<answer>')[1].split('</answer>')[0].strip()}\n</answer>"
    answer_text = loop_verify_format(answer_text, INFORMATION_EXTRACTION_TAG_FORMAT, messages, verify_INFORMATION_EXTRACTION_PROMPT, 'answer')
    # Bug fix: the original passed INFORMATION_EXTRACTION_PROMPT (the full
    # prompt template) as the tag-format corrective text; the parallel
    # comparison path passes its TAG_FORMAT constant, so use
    # INFORMATION_EXTRACTION_TAG_FORMAT here for consistency.
    answer_text = loop_verify_all_tags_present(answer_text, tags, user_message, INFORMATION_EXTRACTION_TAG_FORMAT, verify_INFORMATION_EXTRACTION_PROMPT, 'answer')

    return extract_INFORMATION_EXTRACTION_PROMPT(answer_text)


def extract_text_with_pypdf(pdf_path):
    """Extract plain text from every page of a PDF via PyPDF2.

    Args:
        pdf_path: Path or file-like object accepted by ``PdfReader``.

    Returns:
        All page texts joined by newlines, stripped of surrounding whitespace.
    """
    reader = PdfReader(pdf_path)
    # join is linear in total output size, unlike repeated ``+=`` which is
    # quadratic in the worst case. The f-string preserves the original
    # stringification of each page's extracted text.
    return "\n".join(f"{page.extract_text()}" for page in reader.pages).strip()

def get_tag_info_for_pdf(pdf, tags):
    """Extract and accumulate per-tag information from an entire PDF.

    The document text is split into large chunks, each chunk is run through
    the extraction prompt, and per-tag results are concatenated across chunks.

    Args:
        pdf: Path or file-like object for the PDF.
        tags: Iterable of tag names to collect information for.

    Returns:
        Dict mapping each tag to its newline-joined accumulated text.
    """
    full_text = extract_text_with_pypdf(pdf)
    splitter = RecursiveCharacterTextSplitter(chunk_size=100000, chunk_overlap=0)
    chunks = splitter.split_text(full_text)
    tag_data = dict.fromkeys(tags, "")
    print("chunk length", len(chunks))
    for chunk in chunks:
        extracted = extract_information_from_pdf(chunk, tags)
        for tag in tags:
            tag_data[tag] = f"{tag_data.get(tag)}\n{extracted.get(tag)}"
    return tag_data

def do_comparison_process(pdf1_data, pdf2_data, tags):
    """Ask the LLM to compare per-tag information extracted from two PDFs.

    Args:
        pdf1_data: Mapping of tag -> accumulated text for the first PDF.
        pdf2_data: Mapping of tag -> accumulated text for the second PDF.
        tags: Tags to compare.

    Returns:
        Whatever ``extract_COMPARISON_PROMPT`` parses out of the validated
        ``<comparison>...</comparison>`` response (a per-tag mapping).
    """
    tag_data_text = "\n".join(
        COMPARISON_INPUT_FORMAT.format(
            tag=tag,
            pdf1_information=pdf1_data.get(tag),
            pdf2_information=pdf2_data.get(tag),
        )
        for tag in tags
    )
    prompt = COMPARISON_PROMPT.format(TAG_INFO=tag_data_text)
    user_message = {"role": "user", "content": [{"type": "text", "text": prompt}]}
    response = make_llm_api_call([user_message])
    raw_text = response.content[0].text
    # Pull out just the payload between the comparison tags and re-wrap it.
    body = raw_text.split('<comparison>')[1].split('</comparison>')[0].strip()
    comparison_text = f"<comparison>\n{body}\n</comparison>"
    comparison_text = loop_verify_format(comparison_text, COMPARISON_TAG_FORMAT, [user_message], verify_COMPARISON_PROMPT, 'comparison')
    comparison_text = loop_verify_all_tags_present(comparison_text, tags, user_message, COMPARISON_TAG_FORMAT, verify_COMPARISON_PROMPT, 'comparison')

    return extract_COMPARISON_PROMPT(comparison_text)

# def get_pdf_data(pdf1, pdf2, tags):
#     def get_tag_info_for_pdf(pdf, tags):
#         text = extract_text_with_pypdf(pdf)
#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=100000, chunk_overlap=0)
#         chunks = text_splitter.split_text(text)
#         tag_data = {tag:"" for tag in tags}
#         for chunk in chunks:
#             data = extract_information_from_pdf(chunk, tags)
#             for tag in tags:
#                 tag_data.update({tag:f"{tag_data.get(tag)}\n{data.get(tag)}"})
#         return tag_data

#     # Create a ThreadPoolExecutor (or ProcessPoolExecutor for CPU-bound tasks)
#     with ThreadPoolExecutor(max_workers=2) as executor:
#         # Submit the functions to the executor
#         pdf1_future = executor.submit(get_tag_info_for_pdf, pdf1, tags)
#         pdf2_future = executor.submit(get_tag_info_for_pdf, pdf2, tags)

#         # Collect the results
#         pdf1_data = pdf1_future.result()
#         pdf2_data = pdf2_future.result()

#     return pdf1_data, pdf2_data


def process_comparison_data(pdf1, pdf2, tags):
    """Drive the full two-PDF comparison pipeline and build a result table.

    Each stage is wrapped in a Streamlit spinner so the UI shows progress.

    Args:
        pdf1: First PDF (path or file-like object).
        pdf2: Second PDF (path or file-like object).
        tags: Tags to extract and compare.

    Returns:
        A pandas DataFrame indexed by tag with columns
        'PDF 1', 'PDF 2', and 'Difference'.
    """
    with st.spinner("Processing PDF 1"):
        pdf1_data = get_tag_info_for_pdf(pdf1, tags)
    with st.spinner("Processing PDF 2"):
        pdf2_data = get_tag_info_for_pdf(pdf2, tags)
    with st.spinner("Generating Comparison Data"):
        comparison_data = do_comparison_process(pdf1_data, pdf2_data, tags)
    rows = [
        (tag, pdf1_data.get(tag), pdf2_data.get(tag), comparison_data.get(tag))
        for tag in tags
    ]
    df = pd.DataFrame(rows, columns=['Tags', 'PDF 1', 'PDF 2', 'Difference'])
    df.set_index('Tags', inplace=True)
    return df