WebashalarForML committed on
Commit
2240b0d
·
verified ·
1 Parent(s): 9a45dad

Update utils/file_to_text.py

Browse files
Files changed (1) hide show
  1. utils/file_to_text.py +131 -131
utils/file_to_text.py CHANGED
@@ -1,132 +1,132 @@
1
- import os
2
- import re
3
- import fitz
4
- import logging
5
- from PIL import Image
6
- from pdf2image import convert_from_path
7
- import platform
8
- import pytesseract
9
- import docx
10
- from odf.opendocument import load as load_odt
11
- from odf.text import P
12
-
13
- # Path to tesseract executable (ensure it points to tesseract.exe)
14
- if platform.system() == "Windows":
15
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
16
- else:
17
- # For Hugging Face Spaces or other Linux environments
18
- pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
19
-
20
- # Set up logging
21
- # logging.basicConfig(
22
- # level=logging.DEBUG,
23
- # format='%(asctime)s - %(levelname)s - %(message)s',
24
- # handlers=[logging.StreamHandler()]
25
- # )
26
-
27
- # # Path to Tesseract executable
28
- # tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
29
- # pytesseract.pytesseract.tesseract_cmd = tesseract_path
30
-
31
- # Function to extract text from PDF using PyMuPDF
32
- def extract_text_from_pdf(file_path):
33
- text = ""
34
- hyperlinks = []
35
- try:
36
- doc = fitz.open(file_path)
37
- for page_num in range(doc.page_count):
38
- page = doc.load_page(page_num)
39
- page_text = page.get_text("text")
40
-
41
- if not page_text.strip():
42
- images = convert_from_path(file_path, dpi=300)
43
- for image in images:
44
- text += pytesseract.image_to_string(image)
45
- else:
46
- text += page_text
47
-
48
- links = page.get_links()
49
- for link in links:
50
- if link.get("uri"):
51
- hyperlinks.append(link["uri"])
52
- except Exception as e:
53
- logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
54
- return "", []
55
-
56
- return text, list(set(hyperlinks))
57
-
58
- # Function to extract text from DOCX
59
- def extract_text_from_docx(file_path):
60
- try:
61
- doc = docx.Document(file_path)
62
- text = "\n".join([para.text for para in doc.paragraphs])
63
- return text
64
- except Exception as e:
65
- logging.error(f"Error extracting text from DOCX: {e}")
66
- return ""
67
-
68
- # Function to extract text from RSF (assuming text-based format)
69
- def extract_text_from_rsf(file_path):
70
- try:
71
- with open(file_path, "r", encoding="utf-8") as file:
72
- return file.read()
73
- except Exception as e:
74
- logging.error(f"Error extracting text from RSF: {e}")
75
- return ""
76
-
77
- # Function to extract text from ODT
78
- def extract_text_from_odt(file_path):
79
- try:
80
- odt_doc = load_odt(file_path)
81
- text_elements = odt_doc.getElementsByType(P)
82
- text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild])
83
- return text
84
- except Exception as e:
85
- logging.error(f"Error extracting text from ODT: {e}")
86
- return ""
87
-
88
- # Function to extract text from images using Tesseract
89
- def extract_text_from_image(file_path):
90
- try:
91
- img = Image.open(file_path)
92
- text = pytesseract.image_to_string(img)
93
- return text
94
- except Exception as e:
95
- logging.error(f"Error extracting text from image: {e}")
96
- return ""
97
-
98
- # Function to clean and preprocess the extracted text
99
- def preprocess_text(text):
100
- text = re.sub(r'\s+', ' ', text)
101
- text = re.sub(r'\n', ' ', text)
102
- text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
103
- return text.strip()
104
-
105
- # Function to automatically detect file format and extract text
106
- def extract_text_based_on_format(file_path):
107
- file_ext = os.path.splitext(file_path)[1].lower()
108
-
109
- if file_ext == '.pdf':
110
- text, hyperlinks = extract_text_from_pdf(file_path)
111
- elif file_ext == '.docx':
112
- text = extract_text_from_docx(file_path)
113
- hyperlinks = []
114
- elif file_ext == '.rsf':
115
- text = extract_text_from_rsf(file_path)
116
- hyperlinks = []
117
- elif file_ext == '.odt':
118
- text = extract_text_from_odt(file_path)
119
- hyperlinks = []
120
- elif file_ext in ['.png', '.jpg', '.jpeg']:
121
- text = extract_text_from_image(file_path)
122
- hyperlinks = []
123
- else:
124
- raise ValueError("Unsupported file format")
125
-
126
- return text, hyperlinks
127
-
128
-
129
- def clean_text_to_single_line(text):
130
- # Replace newline characters with a space and remove extra spaces
131
- cleaned_text = ' '.join(text.split())
132
  return cleaned_text
 
1
+ import os
2
+ import re
3
+ import fitz
4
+ import logging
5
+ from PIL import Image
6
+ from pdf2image import convert_from_path
7
+ import platform
8
+ import pytesseract
9
+ import docx
10
+ from odf.opendocument import load as load_odt
11
+ from odf.text import P
12
+
13
# Location of the Tesseract binary used by pytesseract.
# The TESSERACT_CMD environment variable overrides the platform default, so
# deployments with a non-standard install need no code change. Without it,
# Windows uses the stock installer location and everything else (Hugging Face
# Spaces and other Linux environments) uses /usr/bin/tesseract.
if platform.system() == "Windows":
    _DEFAULT_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
else:
    _DEFAULT_TESSERACT_CMD = r'/usr/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = os.getenv('TESSERACT_CMD', _DEFAULT_TESSERACT_CMD)
19
+
20
+ # Set up logging
21
+ # logging.basicConfig(
22
+ # level=logging.DEBUG,
23
+ # format='%(asctime)s - %(levelname)s - %(message)s',
24
+ # handlers=[logging.StreamHandler()]
25
+ # )
26
+
27
+ # # Path to Tesseract executable
28
+ # tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
29
+ # pytesseract.pytesseract.tesseract_cmd = tesseract_path
30
+
31
+ # Function to extract text from PDF using PyMuPDF
32
def extract_text_from_pdf(file_path):
    """Extract the text and hyperlinks of a PDF.

    Every page's embedded text layer is read with PyMuPDF. A page without
    any text is rasterised on its own at 300 dpi and run through Tesseract,
    so scanned pages are OCR'd exactly once and in page order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(text, hyperlinks)`` tuple. ``text`` is the concatenation of all
        page texts; ``hyperlinks`` lists each distinct link URI once, in the
        order it was first seen. Any failure is logged and yields
        ``("", [])``.
    """
    page_texts = []
    hyperlinks = []
    try:
        # The context manager closes the document even when a page fails.
        with fitz.open(file_path) as doc:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_text = page.get_text("text")

                if not page_text.strip():
                    # No text layer: OCR only this page. pdf2image numbers
                    # pages from 1, PyMuPDF from 0.
                    images = convert_from_path(
                        file_path,
                        dpi=300,
                        first_page=page_num + 1,
                        last_page=page_num + 1,
                    )
                    page_text = "".join(
                        pytesseract.image_to_string(image) for image in images
                    )
                page_texts.append(page_text)

                for link in page.get_links():
                    uri = link.get("uri")
                    if uri:
                        hyperlinks.append(uri)
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return "".join(page_texts), list(dict.fromkeys(hyperlinks))
57
+
58
+ # Function to extract text from DOCX
59
def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The text of every paragraph joined with newlines, or ``""`` when the
        file cannot be read (the error is logged).
    """
    try:
        document = docx.Document(file_path)
        lines = []
        for paragraph in document.paragraphs:
            lines.append(paragraph.text)
        return "\n".join(lines)
    except Exception as e:
        logging.error(f"Error extracting text from DOCX: {e}")
        return ""
67
+
68
+ # Function to extract text from RSF (assuming text-based format)
69
def extract_text_from_rsf(file_path):
    """Read an RSF file as plain text.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are
    replaced with U+FFFD instead of aborting, so one stray byte does not
    discard the whole document.

    Args:
        file_path: Path of the RSF file to read.

    Returns:
        The file content as a string, or ``""`` when the file cannot be
        opened or read (the error is logged).
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="replace") as file:
            return file.read()
    except Exception as e:
        logging.error(f"Error extracting text from RSF: {e}")
        return ""
76
+
77
+ # Function to extract text from ODT
78
def extract_text_from_odt(file_path):
    """Return the paragraph text of an ODT file, one paragraph per line.

    The full text of each paragraph is collected, including text nested in
    child elements such as spans. Reading only the first child node would
    lose the rest of the paragraph, and would fail outright whenever that
    first child is an element rather than a text node.

    Args:
        file_path: Path of the ODT file to read.

    Returns:
        The text of every non-empty paragraph joined with newlines, or ``""``
        when the file cannot be read (the error is logged).
    """
    try:
        # Imported here because only this function needs it.
        from odf import teletype

        odt_doc = load_odt(file_path)
        paragraphs = odt_doc.getElementsByType(P)
        # Paragraphs without any child node carry no text and are skipped.
        return "\n".join(
            teletype.extractText(paragraph)
            for paragraph in paragraphs
            if paragraph.firstChild
        )
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""
87
+
88
+ # Function to extract text from images using Tesseract
89
def extract_text_from_image(file_path):
    """OCR an image file with Tesseract.

    Args:
        file_path: Path of the image file to read.

    Returns:
        The recognised text, or ``""`` when the image cannot be opened or
        OCR fails (the error is logged).
    """
    try:
        # The context manager releases the underlying file handle once OCR
        # is done.
        with Image.open(file_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""
97
+
98
+ # Function to clean and preprocess the extracted text
99
# Ten-digit phone numbers with optional '-', '.' or whitespace separators,
# e.g. 123-456-7890, 123.456.7890 or 1234567890.
_PHONE_PATTERN = re.compile(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)')


def preprocess_text(text):
    """Normalise extracted text to a single, single-spaced line.

    All whitespace runs (including newlines) collapse to one space, and each
    phone number is separated from adjacent punctuation by a space.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text with no leading, trailing or repeated whitespace.
    """
    # Collapse first so numbers split across lines still match the pattern.
    text = re.sub(r'\s+', ' ', text)
    text = _PHONE_PATTERN.sub(r' \1 ', text)
    # Padding a number that already had spaces around it leaves double
    # spaces, so collapse once more.
    return re.sub(r'\s+', ' ', text).strip()
104
+
105
+ # Function to automatically detect file format and extract text
106
def extract_text_based_on_format(file_path):
    """Extract text from a file, choosing the extractor by file extension.

    The extension is compared case-insensitively. Only PDFs yield
    hyperlinks; every other supported format returns an empty link list.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(text, hyperlinks)`` tuple.

    Raises:
        ValueError: If the extension is not .pdf, .docx, .rsf, .odt, .png,
            .jpg or .jpeg.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        text, hyperlinks = extract_text_from_pdf(file_path)
        return text, hyperlinks
    if suffix == '.docx':
        return extract_text_from_docx(file_path), []
    if suffix == '.rsf':
        return extract_text_from_rsf(file_path), []
    if suffix == '.odt':
        return extract_text_from_odt(file_path), []
    if suffix in ('.png', '.jpg', '.jpeg'):
        return extract_text_from_image(file_path), []

    raise ValueError("Unsupported file format")
127
+
128
+
129
def clean_text_to_single_line(text):
    """Flatten text onto one line with single spaces between words.

    Args:
        text: Text that may contain newlines, tabs or repeated spaces.

    Returns:
        The whitespace-separated words of ``text`` joined by single spaces.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # leading and trailing whitespace.
    words = text.split()
    return ' '.join(words)