SpacyModelCreator / utils /file_to_text.py
WebashalarForML's picture
Update utils/file_to_text.py
2240b0d verified
raw
history blame
4.31 kB
import logging
import os
import platform
import re

import docx
import fitz
import pytesseract
from odf import teletype
from odf.opendocument import load as load_odt
from odf.text import P
from pdf2image import convert_from_path
from PIL import Image
# Path to the Tesseract executable. The Windows-specific setup below is
# disabled; re-enable it (pointing at tesseract.exe) for local Windows runs.
#if platform.system() == "Windows":
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
#else:
# For Hugging Face Spaces or other Linux environments
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
# Set up logging
# logging.basicConfig(
# level=logging.DEBUG,
# format='%(asctime)s - %(levelname)s - %(message)s',
# handlers=[logging.StreamHandler()]
# )
# # Path to Tesseract executable
# tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
# pytesseract.pytesseract.tesseract_cmd = tesseract_path
# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(file_path):
    """Extract text and hyperlinks from a PDF file.

    Each page's embedded text layer is read with PyMuPDF. A page whose text
    layer is empty (typically a scanned page) is rasterised at 300 dpi and
    passed through Tesseract OCR instead.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A ``(text, hyperlinks)`` tuple: the concatenated text of all pages
        and a de-duplicated list of link URIs in order of first appearance.
        If anything fails, the error is logged and ``("", [])`` is returned.
    """
    page_texts = []
    hyperlinks = []
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(file_path) as doc:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_text = page.get_text("text")
                if page_text.strip():
                    page_texts.append(page_text)
                else:
                    # Rasterise ONLY this page. Converting the whole file
                    # here would OCR every page again for each empty page
                    # and duplicate their text. pdf2image page numbers are
                    # 1-based, PyMuPDF's are 0-based.
                    page_images = convert_from_path(
                        file_path,
                        dpi=300,
                        first_page=page_num + 1,
                        last_page=page_num + 1,
                    )
                    for image in page_images:
                        page_texts.append(pytesseract.image_to_string(image))
                for link in page.get_links():
                    uri = link.get("uri")
                    if uri:
                        hyperlinks.append(uri)
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return "".join(page_texts), list(dict.fromkeys(hyperlinks))
# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path to the .docx document.

    Returns:
        The text of every paragraph joined with newlines, or an empty
        string (after logging the error) if the document cannot be read.
    """
    try:
        paragraphs = docx.Document(file_path).paragraphs
        lines = []
        for paragraph in paragraphs:
            lines.append(paragraph.text)
        joined = "\n".join(lines)
    except Exception as e:
        logging.error(f"Error extracting text from DOCX: {e}")
        return ""
    return joined
# Function to extract text from RSF (assuming text-based format)
def extract_text_from_rsf(file_path):
    """Read an RSF file as UTF-8 text.

    Args:
        file_path: Path to the file; it is treated as plain text.

    Returns:
        The full file contents, or an empty string (after logging the
        error) if the file cannot be opened or decoded.
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as e:
        logging.error(f"Error extracting text from RSF: {e}")
        return ""
    return contents
# Function to extract text from ODT
def extract_text_from_odt(file_path):
    """Return the paragraph text of an ODT file, one paragraph per line.

    Args:
        file_path: Path to the .odt document.

    Returns:
        The text of every non-empty paragraph joined with newlines, or an
        empty string (after logging the error) if extraction fails.
    """
    try:
        odt_doc = load_odt(file_path)
        paragraphs = odt_doc.getElementsByType(P)
        # teletype.extractText gathers the text of ALL descendants of a
        # paragraph. Reading only firstChild.data dropped everything after
        # the first node and raised when that node was an element (such as
        # a formatted span) rather than plain text.
        return "\n".join(
            teletype.extractText(paragraph)
            for paragraph in paragraphs
            if paragraph.firstChild
        )
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""
# Function to extract text from images using Tesseract
def extract_text_from_image(file_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        file_path: Path to the image (e.g. PNG or JPEG).

    Returns:
        The OCR output, or an empty string (after logging the error) if
        the image cannot be opened or OCR fails.
    """
    try:
        # Use the image as a context manager so the underlying file handle
        # is released once OCR finishes, even if it raises.
        with Image.open(file_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""
# Function to clean and preprocess the extracted text
def preprocess_text(text):
    """Normalise whitespace and set phone numbers apart from adjacent text.

    Args:
        text: Raw extracted text.

    Returns:
        The text with every whitespace run (spaces, tabs, newlines)
        collapsed to a single space, each 10-digit phone number separated
        from its neighbours by exactly one space, and no leading or
        trailing whitespace.
    """
    # Collapse first so separators inside a phone number are single
    # characters, which is what the pattern below expects.
    text = re.sub(r'\s+', ' ', text)
    # Pad phone numbers so punctuation glued to them becomes its own token.
    text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
    # The padding can create double spaces next to existing ones; collapse
    # again so the result really is single-spaced.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()
# Function to automatically detect file format and extract text
def extract_text_based_on_format(file_path):
    """Pick an extractor from the file extension and run it.

    The extension is compared case-insensitively. Only PDFs yield
    hyperlinks; every other supported format returns an empty link list.

    Args:
        file_path: Path to the file to extract text from.

    Returns:
        A ``(text, hyperlinks)`` tuple.

    Raises:
        ValueError: If the extension is not .pdf, .docx, .rsf, .odt, .png,
            .jpg or .jpeg.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == '.pdf':
        text, hyperlinks = extract_text_from_pdf(file_path)
        return text, hyperlinks
    if suffix == '.docx':
        return extract_text_from_docx(file_path), []
    if suffix == '.rsf':
        return extract_text_from_rsf(file_path), []
    if suffix == '.odt':
        return extract_text_from_odt(file_path), []
    if suffix in ('.png', '.jpg', '.jpeg'):
        return extract_text_from_image(file_path), []

    raise ValueError("Unsupported file format")
def clean_text_to_single_line(text):
    """Flatten text onto one line with single spaces between words.

    Args:
        text: Text that may contain newlines, tabs or repeated spaces.

    Returns:
        The whitespace-separated words of ``text`` joined by single spaces,
        with no leading or trailing whitespace.
    """
    # split() with no arguments breaks on any whitespace run and drops
    # empty pieces, so joining the words gives a single clean line.
    words = text.split()
    return ' '.join(words)