Spaces:
Runtime error
Runtime error
import os | |
import re | |
import fitz | |
import logging | |
from PIL import Image | |
from pdf2image import convert_from_path | |
import platform | |
import pytesseract | |
import docx | |
from odf.opendocument import load as load_odt | |
from odf.text import P | |
# Path to tesseract executable (ensure it points to tesseract.exe) | |
#if platform.system() == "Windows": | |
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' | |
#else: | |
# For Hugging Face Spaces or other Linux environments | |
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' | |
# Set up logging | |
# logging.basicConfig( | |
# level=logging.DEBUG, | |
# format='%(asctime)s - %(levelname)s - %(message)s', | |
# handlers=[logging.StreamHandler()] | |
# ) | |
# # Path to Tesseract executable | |
# tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract') | |
# pytesseract.pytesseract.tesseract_cmd = tesseract_path | |
# Function to extract text from PDF using PyMuPDF | |
def extract_text_from_pdf(file_path): | |
text = "" | |
hyperlinks = [] | |
try: | |
doc = fitz.open(file_path) | |
for page_num in range(doc.page_count): | |
page = doc.load_page(page_num) | |
page_text = page.get_text("text") | |
if not page_text.strip(): | |
images = convert_from_path(file_path, dpi=300) | |
for image in images: | |
text += pytesseract.image_to_string(image) | |
else: | |
text += page_text | |
links = page.get_links() | |
for link in links: | |
if link.get("uri"): | |
hyperlinks.append(link["uri"]) | |
except Exception as e: | |
logging.error(f"Error extracting text or hyperlinks from PDF: {e}") | |
return "", [] | |
return text, list(set(hyperlinks)) | |
# Function to extract text from DOCX | |
def extract_text_from_docx(file_path): | |
try: | |
doc = docx.Document(file_path) | |
text = "\n".join([para.text for para in doc.paragraphs]) | |
return text | |
except Exception as e: | |
logging.error(f"Error extracting text from DOCX: {e}") | |
return "" | |
# Function to extract text from RSF (assuming text-based format) | |
def extract_text_from_rsf(file_path): | |
try: | |
with open(file_path, "r", encoding="utf-8") as file: | |
return file.read() | |
except Exception as e: | |
logging.error(f"Error extracting text from RSF: {e}") | |
return "" | |
# Function to extract text from ODT | |
def extract_text_from_odt(file_path): | |
try: | |
odt_doc = load_odt(file_path) | |
text_elements = odt_doc.getElementsByType(P) | |
text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild]) | |
return text | |
except Exception as e: | |
logging.error(f"Error extracting text from ODT: {e}") | |
return "" | |
# Function to extract text from images using Tesseract | |
def extract_text_from_image(file_path): | |
try: | |
img = Image.open(file_path) | |
text = pytesseract.image_to_string(img) | |
return text | |
except Exception as e: | |
logging.error(f"Error extracting text from image: {e}") | |
return "" | |
# Function to clean and preprocess the extracted text | |
def preprocess_text(text): | |
text = re.sub(r'\s+', ' ', text) | |
text = re.sub(r'\n', ' ', text) | |
text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text) | |
return text.strip() | |
# Function to automatically detect file format and extract text | |
def extract_text_based_on_format(file_path): | |
file_ext = os.path.splitext(file_path)[1].lower() | |
if file_ext == '.pdf': | |
text, hyperlinks = extract_text_from_pdf(file_path) | |
elif file_ext == '.docx': | |
text = extract_text_from_docx(file_path) | |
hyperlinks = [] | |
elif file_ext == '.rsf': | |
text = extract_text_from_rsf(file_path) | |
hyperlinks = [] | |
elif file_ext == '.odt': | |
text = extract_text_from_odt(file_path) | |
hyperlinks = [] | |
elif file_ext in ['.png', '.jpg', '.jpeg']: | |
text = extract_text_from_image(file_path) | |
hyperlinks = [] | |
else: | |
raise ValueError("Unsupported file format") | |
return text, hyperlinks | |
def clean_text_to_single_line(text): | |
# Replace newline characters with a space and remove extra spaces | |
cleaned_text = ' '.join(text.split()) | |
return cleaned_text |