Spaces:

GAS17
/

pdfextract

Runtime error

GAS17 committed on Dec 21, 2024

Commit

9204aaf

verified ·

1 Parent(s): b3e1fb5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,34 +1,49 @@
 import gradio as gr
 import io
-from doctr.io import DocumentFile
-from doctr.models import ocr_predictor
 # Initialize the OCR model
-model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
 def ocr_process(file):
-    # Read the uploaded file
-    if file.name.lower().endswith('.pdf'):
-        doc = DocumentFile.from_pdf(file.name)
-    else:
-        # Assume it's an image if not PDF
-        image_stream = io.BytesIO(file.read())
-        doc = DocumentFile.from_images(image_stream)
-    # Perform OCR
-    result = model(doc)
-    # Extract text from the result
-    extracted_text = ""
-    for page in result.pages:
-        for block in page.blocks:
-            for line in block.lines:
-                for word in line.words:
-                    extracted_text += word.value + " "
                 extracted_text += "\n"
-            extracted_text += "\n"
-    return extracted_text.strip()
 # Create Gradio interface
 iface = gr.Interface(

 import gradio as gr
 import io
+import sys
+try:
+    from doctr.io import DocumentFile
+    from doctr.models import ocr_predictor
+except ImportError:
+    print("Error: Failed to import doctr. Please ensure it's installed correctly.")
+    print("Python version:", sys.version)
+    print("Python path:", sys.path)
+    raise
 # Initialize the OCR model
+try:
+    model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
+except Exception as e:
+    print(f"Error initializing OCR model: {e}")
+    raise
 def ocr_process(file):
+    try:
+        # Read the uploaded file
+        if file.name.lower().endswith('.pdf'):
+            doc = DocumentFile.from_pdf(file.name)
+        else:
+            # Assume it's an image if not PDF
+            image_stream = io.BytesIO(file.read())
+            doc = DocumentFile.from_images(image_stream)
+        # Perform OCR
+        result = model(doc)
+        # Extract text from the result
+        extracted_text = ""
+        for page in result.pages:
+            for block in page.blocks:
+                for line in block.lines:
+                    for word in line.words:
+                        extracted_text += word.value + " "
+                    extracted_text += "\n"
                 extracted_text += "\n"
+        return extracted_text.strip()
+    except Exception as e:
+        return f"Error processing file: {str(e)}"
 # Create Gradio interface
 iface = gr.Interface(