import os
import re

import docx
import pytesseract
from nltk.tokenize import sent_tokenize, word_tokenize
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
class OCRService:
    def __init__(self):
        pass

    def extract_ocrless_pdf(self, filepath):
        """Extract the embedded text layer from a PDF that already contains one."""
        reader = PdfReader(filepath)
        extracted_text = ""
        for page in reader.pages:
            # extract_text() may return None for pages without a text layer
            text = page.extract_text() or ""
            extracted_text += " " + text
        return extracted_text
    def extract_text_from_pdf(self, filepath):
        """OCR a scanned PDF by rendering each page to an image and running Tesseract."""
        images = convert_from_path(filepath, thread_count=4)
        full_text = []
        # config = r"--oem 2 --psm 7"
        for image in images:
            text = pytesseract.image_to_string(image)
            # text = pytesseract.image_to_string(image, config=config)
            full_text.append(text)
        return full_text
    def extract_text_from_document(self, filepath):
        """Route a document to the appropriate extractor based on its file extension."""
        file_ext = os.path.splitext(filepath)[-1].lower()
        if file_ext == ".pdf":
            text_to_process = self.extract_text_from_pdf(filepath)
            text_joined = " ".join(text_to_process)
            # with open(f"{os.path.splitext(filepath)[0]}.txt", "w") as file:
            #     file.writelines(text_to_process)
        elif file_ext in (".doc", ".docx"):
            doc_content = docx.Document(filepath)
            text_to_process = [paragraph.text for paragraph in doc_content.paragraphs]
            text_joined = " \n ".join(text_to_process)
            # with open(f"{os.path.splitext(filepath)[0]}.txt", "w") as file:
            #     file.write(text_joined)
        elif file_ext == ".txt":
            with open(filepath, encoding="utf8") as file:
                text_joined = file.read()
        else:
            raise ValueError(f"Unsupported file type: {file_ext}")
        return text_joined
    def preprocess_document(self, document):
        """Normalize whitespace and quote characters in extracted text."""
        # collapse runs of newlines (str.replace does not interpret regex patterns)
        document = re.sub(r"\n+", "\n", document)
        # document = re.sub(r"\s+", " ", document)
        # replace curly quotes with straight quotes
        document = re.sub("“", "\"", document)
        document = re.sub("”", "\"", document)
        document = re.sub(r"\\\"", "\"", document)
        return document
    def chunk_document(self, text, k=1500):
        """Split text into chunks of at most k words, keeping sentences intact."""
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_word_count = 0
        for sentence in sentences:
            sentence_words = word_tokenize(sentence)
            if current_word_count + len(sentence_words) <= k:
                current_chunk.append(sentence)
                current_word_count += len(sentence_words)
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk = [sentence]
                current_word_count = len(sentence_words)
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        # filter out near-empty chunks; deleting while iterating skips elements
        chunks = [chunk for chunk in chunks if len(chunk.split()) >= 2]
        return chunks
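
# Minimal usage sketch (not part of the original service): assumes NLTK's "punkt"
# tokenizer data is installed, and that "sample.pdf" is a hypothetical scanned PDF
# available locally. The Poppler utilities (needed by pdf2image) and the Tesseract
# binary must also be on the system path for OCR to work.
if __name__ == "__main__":
    ocr_service = OCRService()
    raw_text = ocr_service.extract_text_from_document("sample.pdf")
    cleaned_text = ocr_service.preprocess_document(raw_text)
    chunks = ocr_service.chunk_document(cleaned_text, k=1500)
    print(f"Extracted {len(chunks)} chunk(s) from the document")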