robertselvam committed on
Commit
90cd969
·
1 Parent(s): d47f161

Upload headings_extractor.py

Browse files
Files changed (1) hide show
  1. headings_extractor.py +77 -0
headings_extractor.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ from PyPDF2 import PdfReader
3
+ import fitz
4
+ import os
5
+
6
class HeadingsExtractor:
    """Extract headings from PDF documents using an OpenAI completion model.

    Each page of the PDF is read with ``PdfReader``, its text is sent to
    ``openai.Completion.create`` and the headings returned for every page
    are collected into a single string.

    The instance holds no state. The historical (misspelled) method names
    ``extarct_headings`` and ``extarct_text`` are kept for existing callers;
    correctly spelled aliases are provided next to them.
    """

    def __init__(self):
        """Create an extractor; there is nothing to configure or store."""
        pass

    def extarct_headings(self, contract_page: str) -> str:
        """Extract headings from a given paragraph using an OpenAI completion.

        Args:
            contract_page (str): The paragraph (typically the text of one PDF
                page) from which headings need to be extracted.

        Returns:
            str: The headings returned by the model, stripped of surrounding
            whitespace. An empty string is returned when the input has no
            text or when the request fails, so callers always get a ``str``.
        """
        try:
            # Nothing to extract from an empty page: skip the API round trip
            # instead of asking the model about an empty paragraph.
            if not contract_page or not contract_page.strip():
                return ""

            # NOTE(review): `openai.Completion` is the legacy completions
            # interface of the openai package - confirm the installed version
            # still provides it.
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=(
                    "Extract Headings from given paragraph do not generate jsu extract the headings from paragraph.\n"
                    f"```paragraph :{contract_page}```"
                ),
                max_tokens=100,
                temperature=0,
            )
            return response.choices[0].text.strip()

        except Exception as e:
            # Best effort: report the failure and hand back an empty result so
            # that one bad page does not abort a multi-page extraction.
            print(f"Error while extracting headings: {str(e)}")
            return ""

    def extarct_text(self, pdf_file_path: str) -> str:
        """Extract the headings of every page of a PDF document.

        Args:
            pdf_file_path (str): Path to the PDF file. A ``str`` or
                ``os.PathLike`` path is used as is; any other object (for
                example an uploaded-file wrapper) is expected to expose the
                path through its ``name`` attribute, as the original code
                required.

        Returns:
            str: Headings of all pages, one page's headings per line, in page
            order. Pages that yield no headings are skipped. An empty string
            is returned when the document cannot be read.
        """
        try:
            print("path", pdf_file_path)

            # Real paths must not go through `.name`: str has no such
            # attribute and pathlib.Path.name is only the final component.
            if isinstance(pdf_file_path, (str, os.PathLike)):
                source = os.fspath(pdf_file_path)
            else:
                source = getattr(pdf_file_path, "name", pdf_file_path)

            pdf = PdfReader(source)

            page_headings = []
            for page in pdf.pages:
                result = self.extarct_headings(page.extract_text())
                if result:
                    page_headings.append(result)

            # Each result is stripped, so a separator is needed to keep the
            # last heading of one page apart from the first of the next.
            return "\n".join(page_headings)

        except Exception as e:
            # Best effort: report the failure and return an empty result.
            print(f"Error while extracting text from PDF: {str(e)}")
            return ""

    # Correctly spelled aliases; the original names stay available.
    extract_headings = extarct_headings
    extract_text = extarct_text