robertselvam committed
Commit a2780b1 · 1 Parent(s): 5850aac

Upload 5 files
Clauses_Extractor.py ADDED
@@ -0,0 +1,48 @@
+ import openai
+ import os
+
+
+ # Define the Clauses class
+ class Clauses:
+     def __init__(self):
+         """
+         Initialize the Clauses extractor.
+         """
+
+         # Set OpenAI API key
+         # os.environ["OPENAI_API_KEY"] = ""
+
+     def get_extracted_clauses(self, extracted_summary):
+         """
+         Extract clauses from the provided contract text using GPT-3.
+
+         Args:
+             extracted_summary (str): Text extracted from the contract PDF.
+
+         Returns:
+             str: Extracted clauses from the GPT-3 response.
+         """
+         try:
+             # Prepare a prompt for GPT-3 that includes the extracted PDF text and instructions
+             prompt = f"""
+             Extract clauses and sub-clauses from the provided contract PDF:
+
+             {extracted_summary}
+
+             Instructions: Organize the extracted clauses and sub-clauses in a readable format.
+             """
+             # Use GPT-3 to process the prompt and generate clauses
+             response = openai.Completion.create(
+                 engine="text-davinci-003",
+                 prompt=prompt,
+                 max_tokens=1000
+             )
+
+             # Extract the generated text from the GPT-3 response
+             result = response['choices'][0]['text'].strip()
+             return result
+
+         except Exception as e:
+             # If an error occurs during GPT-3 processing, log the error and raise an exception
+             print(f"Error occurred while processing PDF with GPT-3. Error message: {str(e)}")
+             raise
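
A minimal usage sketch for the Clauses class (illustrative, not part of the commit): it assumes the OPENAI_API_KEY environment variable is already set, and contract_text is a placeholder for text extracted from a contract by an upstream step.

from Clauses_Extractor import Clauses

# Assumes OPENAI_API_KEY is already set in the environment; the pre-1.0
# openai library reads it automatically.
extractor = Clauses()
contract_text = "..."  # placeholder: contract text produced by an upstream step
clauses = extractor.get_extracted_clauses(contract_text)
print(clauses)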
Tags_Extractor.py ADDED
@@ -0,0 +1,39 @@
+ import openai
+ import os
+
+
+ class Tags:
+     def __init__(self):
+         """
+         Initialize the Tags extractor.
+         """
+
+         # Set OpenAI API key
+
+
+         # openai.api_key = ""
+
+
+     def extract_tags(self, extracted_summary):
+         """
+         Extract tags from the refined summary using the OpenAI API.
+
+         Returns:
+             str: Extracted tags.
+         """
+         try:
+
+             # Use OpenAI's Completion API to analyze the text and extract tags
+             response = openai.Completion.create(
+                 engine="text-davinci-003",
+                 temperature=0,
+                 prompt=f"Analyze the given contract and extract tags for the contract in triple backticks. The tags should be bullet points. Contract: ```{extracted_summary}```.",
+                 max_tokens=1000
+             )
+
+             # Extract and return the chatbot's reply
+             result = response['choices'][0]['text'].strip()
+             return result
+
+         except Exception as e:
+             print(f"Error occurred while extracting tags: {str(e)}")
key_value_extractor.py ADDED
@@ -0,0 +1,37 @@
+ import openai
+ import os
+
+
+ # Define the KeyValue class
+ class KeyValue:
+     def __init__(self):
+         """
+         Initialize the KeyValue extractor.
+         """
+
+         # Set OpenAI API key
+         # os.environ["OPENAI_API_KEY"] = ""
+
+     def extract_key_value_pair(self, extracted_summary):
+         """
+         Extract key-value pairs from the refined summary.
+
+         Returns the extracted key-value pairs as a string.
+         """
+         try:
+
+
+             # Use OpenAI's Completion API to analyze the text and extract key-value pairs
+             response = openai.Completion.create(
+                 engine="text-davinci-003",
+                 temperature=0,
+                 prompt=f"Analyze the given contract and extract meaningful key-value pairs from its content. The contract is in backticks: ```{extracted_summary}```.",
+                 max_tokens=1000
+             )
+
+             # Extract and return the chatbot's reply
+             result = response['choices'][0]['text'].strip()
+             return result
+         except Exception as e:
+             # If an error occurs during the key-value extraction process, log the error
+             print(f"Error occurred while extracting key-value pairs: {str(e)}")
pdftojson.py ADDED
@@ -0,0 +1,84 @@
+ import os
+ import PyPDF2
+ from langchain import PromptTemplate, LLMChain
+ from langchain.llms import OpenAI
+
+ class PdftoJson:
+
+     def __init__(self):
+         """
+         Initialize the PdftoJson class with the OpenAI API key.
+         """
+         # OPENAI_API_KEY = ""
+         # os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
+
+     def _get_json(self, input_text: str) -> str:
+         """
+         Generate JSON result by analyzing and splitting input text into topics and content.
+
+         Args:
+             input_text (str): Text to be analyzed.
+
+         Returns:
+             str: JSON result containing topics and content.
+         """
+         try:
+
+             # Initialize the OpenAI language model with specified settings
+             llm = OpenAI(temperature=0, max_tokens=1000)
+
+             # Define a template that instructs the model to split input text into topics and content
+             template = """
+             Your task is to take the text, analyse it, and split it into Topics and Content in JSON format. Give each Topic a proper name (no numbers) and do not include any empty Contents. Keep the output format clean and consistent.
+
+             {text}
+             """
+             prompt = PromptTemplate(template=template, input_variables=["text"])
+
+             # Create an LLMChain instance to chain the prompt and language model together
+             llm_chain = LLMChain(prompt=prompt, llm=llm)
+
+             # Use the provided input text to generate JSON result using the model
+             text = input_text
+             json_result = llm_chain.run(text)
+
+             return json_result
+
+         except Exception as e:
+             print(f"Error occurred while generating JSON result: {str(e)}")
+
+
+     def extract_text_from_pdf(self, pdf_path: str):
+         """
+         Extract text from a PDF file, generate a JSON result for each page, and return the combined result.
+
+         Args:
+             pdf_path (str): Path to the PDF file.
+         """
+         try:
+
+             # Accumulate the JSON result generated for each page
+             json_results = []
+
+             # Open the PDF file in binary read mode
+             with open(pdf_path, "rb") as pdf_file:
+                 # Create a PDF reader object
+                 pdf_reader = PyPDF2.PdfReader(pdf_file)
+
+                 # Iterate through each page in the PDF
+                 for page_number in range(len(pdf_reader.pages)):
+                     # Extract text from the current page
+                     page = pdf_reader.pages[page_number]
+                     text = page.extract_text()
+
+                     # Generate JSON result for the extracted text
+                     json_results.append(self._get_json(text))
+
+             # # Clear Extra Spaces
+             # clear_json_result = self._remove_empty_lines(json_result)
+
+             # # Save the JSON result to a file
+             # self._save_json(clear_json_result)
+             return "\n".join(json_results)
+
+         except Exception as e:
+             print(f"Error occurred during extraction and processing: {str(e)}")
summary_extractor.py ADDED
@@ -0,0 +1,121 @@
+ import openai
+ import json
+ from typing import Dict
+ import os
+ from typing import List
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.chains.mapreduce import MapReduceChain
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.chains.summarize import load_summarize_chain
+ from langchain.prompts import PromptTemplate
+
+
+ class Extractor:
+
+     """
+     This class handles the extraction of a refined summary from a PDF document.
+
+     Each method takes the input PDF as an uploaded file object (or file handle).
+     """
+     def __init__(self):
+         """
+         Initialize the Extractor class.
+         """
+
+         # Set OpenAI API key
+         # os.environ["OPENAI_API_KEY"] = ""
+
+     def _document_loader(self, pdf_file_path) -> List[str]:
+         """
+         Load and split the PDF document into individual pages.
+
+         Returns:
+             List[str]: List of text content from each page.
+         """
+         try:
+             # pdf_file_path is expected to expose a .name attribute pointing to the PDF on disk
+             loader = PyPDFLoader(pdf_file_path.name)
+             pages = loader.load_and_split()
+             return pages
+
+         except Exception as e:
+             print(f"Error while loading and splitting the document: {str(e)}")
+
+
+     def _document_text_splitter(self, pdf_file_path) -> List[str]:
+         """
+         Split the document text into smaller chunks.
+
+         Returns:
+             List[str]: List of smaller text chunks.
+         """
+         try:
+             # Load the document texts
+             docs = self._document_loader(pdf_file_path)
+
+             # Initialize the text splitter with specified chunk size and overlap
+             text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+                 chunk_size=1000, chunk_overlap=200
+             )
+
+             # Split the documents into chunks
+             split_docs = text_splitter.split_documents(docs)
+
+             # Return the list of split document chunks
+             return split_docs
+
+         except Exception as e:
+             print(f"Error while splitting document text: {str(e)}")
+
+
+     def _refine_summary(self, pdf_file_path) -> str:
+         """
+         Generate a refined summary of the document using language models.
+
+         Returns:
+             str: Refined summary text.
+         """
+         try:
+             # Split documents into chunks for efficient processing
+             split_docs = self._document_text_splitter(pdf_file_path)
+
+             # Prepare the prompt template for summarization
+             prompt_template = """Write a concise summary of the following:
+             {text}
+             CONCISE SUMMARY:"""
+             prompt = PromptTemplate.from_template(prompt_template)
+
+             # Prepare the template for refining the summary with additional context
+             refine_template = (
+                 "Your job is to produce a final summary.\n"
+                 "We have provided an existing summary up to a certain point: {existing_answer}\n"
+                 "We have the opportunity to refine the existing summary "
+                 "(only if needed) with some more context below.\n"
+                 "------------\n"
+                 "{text}\n"
+                 "------------\n"
+                 "Given the new context, refine the original summary. "
+                 "If the context isn't useful, return the original summary."
+             )
+             refine_prompt = PromptTemplate.from_template(refine_template)
+
+             # Load the summarization chain using the ChatOpenAI language model
+             chain = load_summarize_chain(
+                 llm=ChatOpenAI(temperature=0),
+                 chain_type="refine",
+                 question_prompt=prompt,
+                 refine_prompt=refine_prompt,
+                 return_intermediate_steps=True,
+                 input_key="input_documents",
+                 output_key="output_text",
+             )
+
+             # Generate the refined summary using the loaded summarization chain
+             result = chain({"input_documents": split_docs}, return_only_outputs=True)
+
+             return result["output_text"]
+
+         except Exception as e:
+             print(f"Error while generating refined summary: {str(e)}")