Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,92 +1,28 @@
-[old lines 1-28 removed; their content was not captured in this view]
-    "vitstr_small",
-    "vitstr_base",
-    "parseq",
-]
-
-
-def load_predictor(
-    det_arch: str,
-    reco_arch: str,
-    assume_straight_pages: bool,
-    straighten_pages: bool,
-    export_as_straight_boxes: bool,
-    disable_page_orientation: bool,
-    disable_crop_orientation: bool,
-    bin_thresh: float,
-    box_thresh: float,
-    device: torch.device,
-) -> OCRPredictor:
-    """Load a predictor from doctr.models
-    Args:
-        det_arch: detection architecture
-        reco_arch: recognition architecture
-        assume_straight_pages: whether to assume straight pages or not
-        straighten_pages: whether to straighten rotated pages or not
-        export_as_straight_boxes: whether to export boxes as straight or not
-        disable_page_orientation: whether to disable page orientation or not
-        disable_crop_orientation: whether to disable crop orientation or not
-        bin_thresh: binarization threshold for the segmentation map
-        box_thresh: minimal objectness score to consider a box
-        device: torch.device, the device to load the predictor on
-    Returns:
-        instance of OCRPredictor
-    """
-    predictor = ocr_predictor(
-        det_arch,
-        reco_arch,
-        pretrained=True,
-        assume_straight_pages=assume_straight_pages,
-        straighten_pages=straighten_pages,
-        export_as_straight_boxes=export_as_straight_boxes,
-        detect_orientation=not assume_straight_pages,
-        disable_page_orientation=disable_page_orientation,
-        disable_crop_orientation=disable_crop_orientation,
-    ).to(device)
-    predictor.det_predictor.model.postprocessor.bin_thresh = bin_thresh
-    predictor.det_predictor.model.postprocessor.box_thresh = box_thresh
-    return predictor
-
-
-def forward_image(predictor: OCRPredictor, image: np.ndarray, device: torch.device) -> np.ndarray:
-    """Forward an image through the predictor
-    Args:
-        predictor: instance of OCRPredictor
-        image: image to process
-        device: torch.device, the device to process the image on
-    Returns:
-        segmentation map
-    """
-    with torch.no_grad():
-        processed_batches = predictor.det_predictor.pre_processor([image])
-        out = predictor.det_predictor.model(processed_batches[0].to(device), return_model_output=True)
-        seg_map = out["out_map"].to("cpu").numpy()
-
-    return seg_map
+import gradio as gr
+import pytesseract
+from pdf2image import convert_from_path
+import tempfile
+import os
+
+def ocr_pdf(pdf_file):
+    with tempfile.TemporaryDirectory() as path:
+        pdf_path = os.path.join(path, "temp.pdf")
+        with open(pdf_path, 'wb') as f:
+            f.write(pdf_file.read())
+
+        images = convert_from_path(pdf_path)
+        text = ""
+        for image in images:
+            text += pytesseract.image_to_string(image)
+        return text
+
+iface = gr.Interface(
+    fn=ocr_pdf,
+    inputs=gr.File(label="Upload PDF", type="binary"),
+    outputs=gr.Textbox(label="Extracted Text"),
+    title="PDF OCR with PyTesseract",
+    description="Upload a PDF file to extract its text using PyTesseract."
+)
+
+if __name__ == "__main__":
+    iface.launch()
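Note on the new callback: with gr.File(type="binary"), recent Gradio versions hand the callback the upload's raw bytes rather than a file object, in which case pdf_file.read() raises AttributeError. A minimal, version-tolerant sketch of ocr_pdf (an illustration, not part of this commit) that accepts either form:

import os
import tempfile

import pytesseract
from pdf2image import convert_from_path


def ocr_pdf(pdf_file):
    # Depending on the Gradio version, gr.File(type="binary") delivers the
    # upload as raw bytes; older setups may pass a file-like object instead.
    data = pdf_file if isinstance(pdf_file, (bytes, bytearray)) else pdf_file.read()
    with tempfile.TemporaryDirectory() as path:
        pdf_path = os.path.join(path, "temp.pdf")
        with open(pdf_path, "wb") as f:
            f.write(data)
        # convert_from_path renders each PDF page to a PIL image (needs poppler),
        # then Tesseract extracts the text page by page.
        images = convert_from_path(pdf_path)
        return "".join(pytesseract.image_to_string(image) for image in images)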
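The Runtime error badge at the top may simply reflect missing system dependencies: pytesseract shells out to the tesseract binary, and pdf2image needs poppler's pdftoppm. On Hugging Face Spaces those are usually declared in a packages.txt alongside requirements.txt; a plausible pairing for this app (an assumption, not files present in this commit):

packages.txt
    tesseract-ocr
    poppler-utils

requirements.txt
    gradio
    pytesseract
    pdf2image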