Spaces:

WebashalarForML
/

SpacyModelCreator

Sleeping

App Files Files Community

WebashalarForML committed on Oct 22, 2024

Commit

2240b0d

verified ·

1 Parent(s): 9a45dad

Update utils/file_to_text.py

Browse files

Files changed (1) hide show

utils/file_to_text.py +131 -131

utils/file_to_text.py CHANGED Viewed

@@ -1,132 +1,132 @@
-import os
-import re
-import fitz
-import logging
-from PIL import Image
-from pdf2image import convert_from_path
-import platform
-import pytesseract
-import docx
-from odf.opendocument import load as load_odt
-from odf.text import P
-# Path to tesseract executable (ensure it points to tesseract.exe)
-if platform.system() == "Windows":
-    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
-else:
-    # For Hugging Face Spaces or other Linux environments
-    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
-# Set up logging
-# logging.basicConfig(
-#     level=logging.DEBUG,
-#     format='%(asctime)s - %(levelname)s - %(message)s',
-#     handlers=[logging.StreamHandler()]
-# )
-# # Path to Tesseract executable
-# tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
-# pytesseract.pytesseract.tesseract_cmd = tesseract_path
-# Function to extract text from PDF using PyMuPDF
-def extract_text_from_pdf(file_path):
-    text = ""
-    hyperlinks = []
-    try:
-        doc = fitz.open(file_path)
-        for page_num in range(doc.page_count):
-            page = doc.load_page(page_num)
-            page_text = page.get_text("text")
-            if not page_text.strip():
-                images = convert_from_path(file_path, dpi=300)
-                for image in images:
-                    text += pytesseract.image_to_string(image)
-            else:
-                text += page_text
-            links = page.get_links()
-            for link in links:
-                if link.get("uri"):
-                    hyperlinks.append(link["uri"])
-    except Exception as e:
-        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
-        return "", []
-    return text, list(set(hyperlinks))
-# Function to extract text from DOCX
-def extract_text_from_docx(file_path):
-    try:
-        doc = docx.Document(file_path)
-        text = "\n".join([para.text for para in doc.paragraphs])
-        return text
-    except Exception as e:
-        logging.error(f"Error extracting text from DOCX: {e}")
-        return ""
-# Function to extract text from RSF (assuming text-based format)
-def extract_text_from_rsf(file_path):
-    try:
-        with open(file_path, "r", encoding="utf-8") as file:
-            return file.read()
-    except Exception as e:
-        logging.error(f"Error extracting text from RSF: {e}")
-        return ""
-# Function to extract text from ODT
-def extract_text_from_odt(file_path):
-    try:
-        odt_doc = load_odt(file_path)
-        text_elements = odt_doc.getElementsByType(P)
-        text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild])
-        return text
-    except Exception as e:
-        logging.error(f"Error extracting text from ODT: {e}")
-        return ""
-# Function to extract text from images using Tesseract
-def extract_text_from_image(file_path):
-    try:
-        img = Image.open(file_path)
-        text = pytesseract.image_to_string(img)
-        return text
-    except Exception as e:
-        logging.error(f"Error extracting text from image: {e}")
-        return ""
-# Function to clean and preprocess the extracted text
-def preprocess_text(text):
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'\n', ' ', text)
-    text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
-    return text.strip()
-# Function to automatically detect file format and extract text
-def extract_text_based_on_format(file_path):
-    file_ext = os.path.splitext(file_path)[1].lower()
-    if file_ext == '.pdf':
-        text, hyperlinks = extract_text_from_pdf(file_path)
-    elif file_ext == '.docx':
-        text = extract_text_from_docx(file_path)
-        hyperlinks = []
-    elif file_ext == '.rsf':
-        text = extract_text_from_rsf(file_path)
-        hyperlinks = []
-    elif file_ext == '.odt':
-        text = extract_text_from_odt(file_path)
-        hyperlinks = []
-    elif file_ext in ['.png', '.jpg', '.jpeg']:
-        text = extract_text_from_image(file_path)
-        hyperlinks = []
-    else:
-        raise ValueError("Unsupported file format")
-    return text, hyperlinks
-def clean_text_to_single_line(text):
-    # Replace newline characters with a space and remove extra spaces
-    cleaned_text = ' '.join(text.split())
     return cleaned_text

+import os
+import re
+import fitz
+import logging
+from PIL import Image
+from pdf2image import convert_from_path
+import platform
+import pytesseract
+import docx
+from odf.opendocument import load as load_odt
+from odf.text import P
+# Path to tesseract executable (ensure it points to tesseract.exe)
+#if platform.system() == "Windows":
+#    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+#else:
+    # For Hugging Face Spaces or other Linux environments
+pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
+# Set up logging
+# logging.basicConfig(
+#     level=logging.DEBUG,
+#     format='%(asctime)s - %(levelname)s - %(message)s',
+#     handlers=[logging.StreamHandler()]
+# )
+# # Path to Tesseract executable
+# tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
+# pytesseract.pytesseract.tesseract_cmd = tesseract_path
+# Function to extract text from PDF using PyMuPDF
+def extract_text_from_pdf(file_path):
+    text = ""
+    hyperlinks = []
+    try:
+        doc = fitz.open(file_path)
+        for page_num in range(doc.page_count):
+            page = doc.load_page(page_num)
+            page_text = page.get_text("text")
+            if not page_text.strip():
+                images = convert_from_path(file_path, dpi=300)
+                for image in images:
+                    text += pytesseract.image_to_string(image)
+            else:
+                text += page_text
+            links = page.get_links()
+            for link in links:
+                if link.get("uri"):
+                    hyperlinks.append(link["uri"])
+    except Exception as e:
+        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
+        return "", []
+    return text, list(set(hyperlinks))
+# Function to extract text from DOCX
+def extract_text_from_docx(file_path):
+    try:
+        doc = docx.Document(file_path)
+        text = "\n".join([para.text for para in doc.paragraphs])
+        return text
+    except Exception as e:
+        logging.error(f"Error extracting text from DOCX: {e}")
+        return ""
+# Function to extract text from RSF (assuming text-based format)
+def extract_text_from_rsf(file_path):
+    try:
+        with open(file_path, "r", encoding="utf-8") as file:
+            return file.read()
+    except Exception as e:
+        logging.error(f"Error extracting text from RSF: {e}")
+        return ""
+# Function to extract text from ODT
+def extract_text_from_odt(file_path):
+    try:
+        odt_doc = load_odt(file_path)
+        text_elements = odt_doc.getElementsByType(P)
+        text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild])
+        return text
+    except Exception as e:
+        logging.error(f"Error extracting text from ODT: {e}")
+        return ""
+# Function to extract text from images using Tesseract
+def extract_text_from_image(file_path):
+    try:
+        img = Image.open(file_path)
+        text = pytesseract.image_to_string(img)
+        return text
+    except Exception as e:
+        logging.error(f"Error extracting text from image: {e}")
+        return ""
+# Function to clean and preprocess the extracted text
+def preprocess_text(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'\n', ' ', text)
+    text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
+    return text.strip()
+# Function to automatically detect file format and extract text
+def extract_text_based_on_format(file_path):
+    file_ext = os.path.splitext(file_path)[1].lower()
+    if file_ext == '.pdf':
+        text, hyperlinks = extract_text_from_pdf(file_path)
+    elif file_ext == '.docx':
+        text = extract_text_from_docx(file_path)
+        hyperlinks = []
+    elif file_ext == '.rsf':
+        text = extract_text_from_rsf(file_path)
+        hyperlinks = []
+    elif file_ext == '.odt':
+        text = extract_text_from_odt(file_path)
+        hyperlinks = []
+    elif file_ext in ['.png', '.jpg', '.jpeg']:
+        text = extract_text_from_image(file_path)
+        hyperlinks = []
+    else:
+        raise ValueError("Unsupported file format")
+    return text, hyperlinks
+def clean_text_to_single_line(text):
+    # Replace newline characters with a space and remove extra spaces
+    cleaned_text = ' '.join(text.split())
     return cleaned_text