Spaces:
Sleeping
Sleeping
robertselvam
committed on
Commit
·
a2780b1
1
Parent(s):
5850aac
Upload 5 files
Browse files- Clauses_Extractor.py +48 -0
- Tags_Extractor.py +39 -0
- key_value_extractor.py +37 -0
- pdftojson.py +84 -0
- summary_extractor.py +121 -0
Clauses_Extractor.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import os
|
3 |
+
|
4 |
+
|
5 |
+
# Define the Clauses class
|
6 |
+
class Clauses:
    """Extracts clauses and sub-clauses from contract text via the OpenAI Completion API."""

    def __init__(self):
        """
        Initialize the Clauses extractor.

        The OpenAI API key is expected to be configured by the caller
        (e.g. via the OPENAI_API_KEY environment variable).
        """
        # Set OpenAI API key
        # os.environ["OPENAI_API_KEY"] = ""

    @staticmethod
    def get_extracted_clauses(extracted_summary):
        """
        Extract clauses and sub-clauses from contract text using GPT-3.

        Declared as a @staticmethod: the original definition omitted ``self``,
        so instance calls (``Clauses().get_extracted_clauses(text)``) would
        bind the instance to ``extracted_summary`` and raise a TypeError.
        ``@staticmethod`` keeps both ``Clauses.get_extracted_clauses(text)``
        and instance calls working with the same single-argument signature.

        Args:
            extracted_summary (str): Contract text/summary to analyze.

        Returns:
            str: Extracted clauses from the GPT-3 response.

        Raises:
            Exception: Re-raised from any failure of the OpenAI API call.
        """
        try:
            # Prepare a prompt for GPT-3 that includes the extracted PDF text and instructions
            prompt = f"""
            Extract clauses and sub-clauses from the provided contract PDF:

            {extracted_summary}

            Instructions: Organize the extracted clauses and sub clauses in a readable format.
            """
            # Use GPT-3 to process the prompt and generate clauses
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                max_tokens=1000
            )

            # Extract the generated text from the GPT-3 response
            result = response['choices'][0]['text'].strip()
            return result

        except Exception as e:
            # Log the error, then re-raise so callers can handle the failure.
            print(f"Error occurred while processing PDF with GPT-3. Error message: {str(e)}")
            raise
|
Tags_Extractor.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import os
|
3 |
+
|
4 |
+
|
5 |
+
class Tags:
    """Extracts bullet-point tags from contract text via the OpenAI Completion API."""

    def __init__(self):
        """
        Initialize the Tags extractor.

        The OpenAI API key is expected to be configured by the caller
        (e.g. ``openai.api_key`` or the OPENAI_API_KEY environment variable).
        """
        # Set OpenAI API key
        # openai.api_key = ""

    @staticmethod
    def extract_tags(extracted_summary):
        """
        Extract tags from the refined summary using the OpenAI API.

        Declared as a @staticmethod: the original definition omitted ``self``,
        so instance calls would bind the instance to ``extracted_summary`` and
        fail. ``@staticmethod`` keeps both class-level and instance-level
        calls working with the same single-argument signature.

        Args:
            extracted_summary (str): Contract text/summary to analyze.

        Returns:
            str | None: Extracted tags as bullet points, or None when the API
            call fails (the error is printed, not raised — best-effort).
        """
        try:
            # Use OpenAI's Completion API to analyze the text and extract tags
            response = openai.Completion.create(
                engine="text-davinci-003",
                temperature=0,
                prompt=f"analyze the given contract to extract tags for following contract in triple backticks. tags should be bullet points.contract :```{extracted_summary}```.",
                max_tokens=1000
            )

            # Extract and return the chatbot's reply
            result = response['choices'][0]['text'].strip()
            return result

        except Exception as e:
            # Best-effort: log the failure; callers receive None.
            print(f"Error occurred while extracting tags: {str(e)}")
|
key_value_extractor.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import os
|
3 |
+
|
4 |
+
|
5 |
+
# Define the KeyValue class
|
6 |
+
class KeyValue:
    """Extracts key-value pairs from contract text via the OpenAI Completion API."""

    def __init__(self):
        """
        Initialize the KeyValue extractor.

        The OpenAI API key is expected to be configured by the caller
        (e.g. via the OPENAI_API_KEY environment variable).
        """
        # Set OpenAI API key
        # os.environ["OPENAI_API_KEY"] = ""

    @staticmethod
    def extract_key_value_pair(extracted_summary):
        """
        Extract key-value pairs from the refined summary.

        Declared as a @staticmethod: the original definition omitted ``self``,
        so instance calls would bind the instance to ``extracted_summary`` and
        fail. ``@staticmethod`` keeps both class-level and instance-level
        calls working with the same single-argument signature.

        Args:
            extracted_summary (str): Contract text/summary to analyze.

        Returns:
            str | None: Extracted key-value pairs, or None when the API call
            fails (the error is printed, not raised — best-effort).
        """
        try:
            # Use OpenAI's Completion API to analyze the text and extract key-value pairs
            response = openai.Completion.create(
                engine="text-davinci-003",
                temperature=0,
                prompt=f"analyze the given contract and get meaningful key value pairs in given content.contract in backticks.```{extracted_summary}```.",
                max_tokens=1000
            )

            # Extract and return the chatbot's reply
            result = response['choices'][0]['text'].strip()
            return result

        except Exception as e:
            # If an error occurs during the key-value extraction process, log the error
            print(f"Error occurred while extracting key-value pairs: {str(e)}")
|
pdftojson.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import PyPDF2
|
3 |
+
from langchain import PromptTemplate, LLMChain
|
4 |
+
from langchain.llms import OpenAI
|
5 |
+
|
6 |
+
class PdftoJson:
    """Converts PDF page text into topic/content JSON using an OpenAI LLM chain."""

    def __init__(self):
        """
        Initialize the PdftoJson class with OpenAI API key.

        The key is expected to be set in the OPENAI_API_KEY environment
        variable by the caller.
        """
        # OPENAI_API_KEY = ""
        # os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

    def _get_json(self, input_text: str) -> str:
        """
        Generate JSON result by analyzing and splitting input text into topics and content.

        Args:
            input_text (str): Text to be analyzed.

        Returns:
            str | None: JSON result containing topics and content, or None when
            the LLM call fails (the error is printed, not raised).
        """
        try:
            # Initialize the OpenAI language model with specified settings
            llm = OpenAI(temperature=0, max_tokens=1000)

            # Template instructing the model to split input text into topics and content
            template = """
            Your task is Get the text and analyse and split it into Topics and Content in json format.Give Proper Name to Topic dont give any Numbers and Dont Give any empty Contents.The Output Format Should Be very good.

            {text}
            """
            prompt = PromptTemplate(template=template, input_variables=["text"])

            # Chain the prompt and language model together, then run on the input
            llm_chain = LLMChain(prompt=prompt, llm=llm)
            return llm_chain.run(input_text)

        except Exception as e:
            print(f"Error occurred while generating JSON result: {str(e)}")

    def extract_text_from_pdf(self, pdf_path: str):
        """
        Extract text from a PDF file and generate a JSON result for each page.

        Bug fix: the original implementation returned from *inside* the page
        loop, so only the first page was ever processed. All pages are now
        converted and the per-page JSON results are joined with newlines,
        keeping the return type a single string.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            str | None: Newline-joined JSON results for every page, or None
            when an error occurs (the error is printed, not raised).
        """
        try:
            # Open the PDF file in binary read mode
            with open(pdf_path, "rb") as pdf_file:
                # Create a PDF reader object
                pdf_reader = PyPDF2.PdfReader(pdf_file)

                # Generate a JSON result for every page's extracted text
                page_results = []
                for page in pdf_reader.pages:
                    text = page.extract_text()
                    page_results.append(self._get_json(text))

            return "\n".join(page_results)

        except Exception as e:
            print(f"Error occurred during extraction and processing: {str(e)}")
|
summary_extractor.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import json
|
3 |
+
from typing import Dict
|
4 |
+
import os
|
5 |
+
from typing import List
|
6 |
+
from langchain.chat_models import ChatOpenAI
|
7 |
+
from langchain.document_loaders import PyPDFLoader
|
8 |
+
from langchain.chains.mapreduce import MapReduceChain
|
9 |
+
from langchain.text_splitter import CharacterTextSplitter
|
10 |
+
from langchain.chains.summarize import load_summarize_chain
|
11 |
+
from langchain.prompts import PromptTemplate
|
12 |
+
|
13 |
+
|
14 |
+
class Extractor:
    """
    Handles extraction of a refined summary from a PDF document.

    Attributes:
        config (dict): Configuration settings loaded from a JSON file.
        pdf_file_path (str): Path to the input PDF file.
    """

    def __init__(self):
        """
        Initialize the Extractor class.
        """
        # Set OpenAI API key
        # os.environ["OPENAI_API_KEY"] = ""

    def _document_loader(self, pdf_file_path) -> List[str]:
        """
        Load the PDF and split it into per-page documents.

        Returns:
            List[str]: Text content of each page.
        """
        try:
            return PyPDFLoader(pdf_file_path.name).load_and_split()
        except Exception as e:
            print(f"Error while loading and splitting the document: {str(e)}")

    def _document_text_spilliter(self, pdf_file_path) -> List[str]:
        """
        Break the loaded document into overlapping chunks for the LLM.

        Returns:
            List[str]: Smaller text chunks (1000 tokens, 200 overlap).
        """
        try:
            # Pages first, then token-aware chunking over them.
            pages = self._document_loader(pdf_file_path)
            splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=1000, chunk_overlap=200
            )
            return splitter.split_documents(pages)
        except Exception as e:
            print(f"Error while splitting document text: {str(e)}")

    def _refine_summary(self, pdf_file_path) -> str:
        """
        Generate a refined summary of the document using a refine-type chain.

        Returns:
            str: Refined summary text.
        """
        try:
            # Chunk the document so each piece fits the model context.
            chunks = self._document_text_spilliter(pdf_file_path)

            # Initial summarization prompt.
            prompt_template = """Write a concise summary of the following:
            {text}
            CONCISE SUMMARY:"""
            question_prompt = PromptTemplate.from_template(prompt_template)

            # Follow-up prompt that folds each new chunk into the summary.
            refine_template = (
                "Your job is to produce a final summary\n"
                "We have provided an existing summary up to a certain point: {existing_answer}\n"
                "We have the opportunity to refine the existing summary"
                "(only if needed) with some more context below.\n"
                "------------\n"
                "{text}\n"
                "------------\n"
                "Given the new context, refine the original summary"
                "If the context isn't useful, return the original summary."
            )
            refine_prompt = PromptTemplate.from_template(refine_template)

            # Refine-type summarization chain over the chat model.
            chain = load_summarize_chain(
                llm=ChatOpenAI(temperature=0),
                chain_type="refine",
                question_prompt=question_prompt,
                refine_prompt=refine_prompt,
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )

            # Run the chain and surface only the final summary text.
            outputs = chain({"input_documents": chunks}, return_only_outputs=True)
            return outputs["output_text"]

        except Exception as e:
            print(f"Error while generating refined summary: {str(e)}")
|