# Data_Conversion / summary_extractor.py
# robertselvam's picture
# Update summary_extractor.py
# 1b2c45f verified
import json
import os
from typing import Dict
from typing import List
from typing import Optional

from langchain.chains.mapreduce import MapReduceChain
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
# from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import PyPDFLoader
# os.system("pip install langchain-openai")
from langchain_openai import AzureChatOpenAI
class Extractor:
    """
    Summarize a PDF contract with an Azure OpenAI chat model.

    The PDF is loaded with ``PyPDFLoader``, re-split with a tiktoken-based
    ``CharacterTextSplitter`` and passed through a LangChain "refine"
    summarization chain.

    Every stage reports failures with ``print`` and returns ``None`` instead
    of raising, so callers must be prepared for a ``None`` result.

    Attributes:
        azure_deployment (str): Azure OpenAI deployment name given to
            ``AzureChatOpenAI``.
        chunk_size (int): ``chunk_size`` given to the text splitter.
        chunk_overlap (int): ``chunk_overlap`` given to the text splitter.
    """

    # Prompt used to summarize the first chunk of the document.
    SUMMARY_PROMPT_TEMPLATE = (
        "Write a concise summary of the following Contract:\n"
        "Contract: {text}\n"
        "CONCISE SUMMARY:"
    )

    # Prompt used to fold every following chunk into the running summary.
    # Adjacent literals are concatenated verbatim, so each fragment that
    # continues a sentence must carry its own separating space.
    REFINE_PROMPT_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary. "
        "If the context isn't useful, return the original summary."
    )

    def __init__(
        self,
        *,
        azure_deployment: str = "GPT-3",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        """
        Initialize the Extractor class.

        All arguments are optional and keyword-only; the defaults reproduce
        the previously hard-coded settings.

        Args:
            azure_deployment: Azure OpenAI deployment name.
            chunk_size: Chunk size for the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
        """
        self.azure_deployment = azure_deployment
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    @staticmethod
    def _resolve_pdf_path(pdf_file_path) -> str:
        """
        Return the filesystem path designated by ``pdf_file_path``.

        Args:
            pdf_file_path: Either a path (``str`` / ``os.PathLike``) or an
                object exposing the path as its ``name`` attribute, such as
                an open or uploaded file.

        Returns:
            str: The path to hand to the PDF loader.

        Raises:
            AttributeError: If the argument is neither a path nor an object
                with a ``name`` attribute.
        """
        # Paths are checked first: pathlib.Path also has a ``.name``, but it
        # is only the basename and would silently drop the directory.
        if isinstance(pdf_file_path, (str, os.PathLike)):
            return os.fspath(pdf_file_path)
        return pdf_file_path.name

    def _document_loader(self, pdf_file_path) -> Optional[List]:
        """
        Load and split the PDF document into individual pages.

        Args:
            pdf_file_path: Path to the PDF, or a file object with ``.name``.

        Returns:
            Optional[List]: The documents produced by
            ``PyPDFLoader.load_and_split()``, or ``None`` if loading failed
            (the error is printed).
        """
        try:
            loader = PyPDFLoader(self._resolve_pdf_path(pdf_file_path))
            return loader.load_and_split()
        except Exception as e:
            print(f"Error while loading and splitting the document: {str(e)}")
            return None

    def _document_text_spilliter(self, pdf_file_path) -> Optional[List]:
        """
        Split the document text into smaller chunks.

        Args:
            pdf_file_path: Path to the PDF, or a file object with ``.name``.

        Returns:
            Optional[List]: The chunked documents, or ``None`` if loading or
            splitting failed (the error is printed).
        """
        docs = self._document_loader(pdf_file_path)
        if docs is None:
            # The loader already reported the failure; do not pile a second,
            # misleading "splitting" error on top of it.
            return None
        try:
            # Initialize the text splitter with the configured size and overlap
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
            )
            return text_splitter.split_documents(docs)
        except Exception as e:
            print(f"Error while splitting document text: {str(e)}")
            return None

    def _refine_summary(self, pdf_file_path) -> Optional[str]:
        """
        Generate a refined summary of the document using language models.

        Args:
            pdf_file_path: Path to the PDF, or a file object with ``.name``.

        Returns:
            Optional[str]: The final summary text, or ``None`` if any stage
            failed (the error is printed).
        """
        split_docs = self._document_text_spilliter(pdf_file_path)
        if split_docs is None:
            # An earlier stage failed and has already printed its error.
            return None
        if not split_docs:
            # Report the empty input here rather than as an opaque error
            # raised from inside the chain.
            print(
                "Error while generating refined summary: "
                "no text chunks were produced from the document"
            )
            return None
        try:
            prompt = PromptTemplate.from_template(self.SUMMARY_PROMPT_TEMPLATE)
            refine_prompt = PromptTemplate.from_template(self.REFINE_PROMPT_TEMPLATE)
            # Load the summarization chain using the Azure OpenAI chat model
            chain = load_summarize_chain(
                llm=AzureChatOpenAI(azure_deployment=self.azure_deployment),
                chain_type="refine",
                question_prompt=prompt,
                refine_prompt=refine_prompt,
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )
            # Generate the refined summary using the loaded summarization chain
            result = chain({"input_documents": split_docs}, return_only_outputs=True)
            return result["output_text"]
        except Exception as e:
            print(f"Error while generating refined summary: {str(e)}")
            return None