rag / extract_text.py
Jose Alvaro Luna G
feat: app init2
da6a7d2
raw
history blame contribute delete
528 Bytes
from pdfminer.high_level import extract_text
from docx import Document
import pytesseract
from PIL import Image
def extract_text_from_image(file_path):
    """Return the text that OCR recognizes in the image at ``file_path``.

    The image is opened with Pillow and passed to
    ``pytesseract.image_to_string``; that call's result is returned
    unchanged.
    """
    # Pillow opens image files lazily and may keep the underlying file
    # handle open. The context manager guarantees the handle is released
    # once OCR has finished, even if pytesseract raises.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)
def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a .docx file, newline-joined.

    Only the entries of ``Document(file_path).paragraphs`` are read; each
    paragraph's ``text`` becomes one line of the returned string.
    """
    document = Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
def extract_text_from_pdf(file_path):
    """Return the text of the PDF at ``file_path``.

    Thin wrapper that delegates to ``pdfminer.high_level.extract_text``
    and returns its result as-is.
    """
    return extract_text(file_path)