Data_Conversion

Sleeping

App Files Files Community

Data_Conversion / headings_extractor.py

robertselvam

Upload headings_extractor.py

90cd969 about 1 year ago

raw

history blame

2.55 kB

	import openai
	from PyPDF2 import PdfReader
	import fitz
	import os

	class HeadingsExtractor:
	    """
	    Extract headings from PDF documents with an OpenAI completion model.

	    The text of every PDF page is sent to the OpenAI Completion endpoint
	    together with a prompt asking for the headings it contains; the per-page
	    answers are concatenated in page order.
	    """

	    DEFAULT_ENGINE = "text-davinci-003"
	    DEFAULT_MAX_TOKENS = 100

	    def __init__(self, engine: str = DEFAULT_ENGINE,
	                 max_tokens: int = DEFAULT_MAX_TOKENS):
	        """
	        Store the settings used for every completion request.

	        Args:
	            engine (str): Name of the OpenAI completion engine to query.
	            max_tokens (int): Maximum number of tokens generated per request.
	        """
	        self.engine = engine
	        self.max_tokens = max_tokens

	    @staticmethod
	    def _resolve_pdf_path(pdf_file_path):
	        """Return the filesystem path to hand to PdfReader."""
	        # Real paths are used as-is. The isinstance check comes first because
	        # pathlib.Path also has a ``.name`` attribute, but it only holds the
	        # final path component.
	        if isinstance(pdf_file_path, (str, os.PathLike)):
	            return pdf_file_path
	        # Anything else is expected to expose its path through ``.name``,
	        # which is the only form the original implementation accepted.
	        return pdf_file_path.name

	    def extarct_headings(self, contract_page: str) -> str:
	        """
	        Extract headings from a given paragraph using an OpenAI completion model.

	        Args:
	            contract_page (str): The paragraph from which headings need to be
	                extracted.

	        Returns:
	            str: The completion text with surrounding whitespace stripped.
	            An empty string is returned when the paragraph is empty or None,
	            or when the request fails (the error is printed).
	        """
	        # Nothing can be extracted from an empty page, so skip the API call.
	        if not contract_page or not contract_page.strip():
	            return ""

	        # The prompt text is kept verbatim so model output does not shift.
	        prompt = (
	            "Extract Headings from given paragraph do not generate jsu extract the headings from paragraph.\n"
	            f"\t```paragraph :{contract_page}```"
	        )

	        try:
	            # Get the response from the OpenAI API.
	            response = openai.Completion.create(
	                engine=self.engine,
	                prompt=prompt,
	                max_tokens=self.max_tokens,
	                temperature=0,
	            )
	            return response.choices[0].text.strip()
	        except Exception as e:
	            # Best effort: report the failure and return an empty result so
	            # the caller can keep processing the remaining pages.
	            print(f"Error while extracting headings: {str(e)}")
	            return ""

	    def extarct_text(self, pdf_file_path: str) -> str:
	        """
	        Extract text from a PDF document and extract headings from each page.

	        Args:
	            pdf_file_path (str): Path to the PDF file to extract text from.
	                A ``str`` or ``os.PathLike`` is used directly; any other
	                object must expose the path through a ``.name`` attribute.

	        Returns:
	            str: Headings of all pages concatenated in page order, without
	            any separator between pages. An empty string is returned when
	            the PDF cannot be read (the error is printed).
	        """
	        try:
	            pdf_path = self._resolve_pdf_path(pdf_file_path)
	            print("path", pdf_file_path)

	            # Open the multi-page PDF using PdfReader.
	            pdf = PdfReader(pdf_path)

	            page_headings = []
	            for page in pdf.pages:
	                # extarct_headings returns "" for pages without text, so
	                # empty pages cost no API call and add nothing here.
	                page_headings.append(self.extarct_headings(page.extract_text()))
	            return "".join(page_headings)

	        except Exception as e:
	            # Best effort: report why the PDF could not be processed.
	            print(f"Error while extracting text from PDF: {str(e)}")
	            return ""