# Source: OCRHub / ocr_processor.py
# Last commit: "Create ocr_processor.py" by VinayHajare (37e77bc, verified)
import base64
import concurrent.futures
import json
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import cv2
import requests
from pdf2image import convert_from_path
from tqdm import tqdm
class OCRProcessor:
    """Extract text from images and PDFs through a vision model behind an HTTP API.

    Each file is optionally preprocessed with OpenCV (grayscale, CLAHE contrast
    enhancement, denoising), base64-encoded and POSTed together with a
    format-specific prompt to ``base_url``. The request payload (``model``,
    ``prompt``, ``stream``, ``images``) and the ``response`` field read from the
    reply target Ollama's ``/api/generate`` endpoint.
    """

    # Prompt templates keyed by output format. Unknown formats fall back to "text".
    _PROMPTS = {
        "markdown": (
            "Please look at this image and extract all the text content. Format the output in markdown:\n"
            "- Use headers (# ## ###) for titles and sections\n"
            "- Use bullet points (-) for lists\n"
            "- Use proper markdown formatting for emphasis and structure\n"
            "- Preserve the original text hierarchy and formatting as much as possible"
        ),
        "text": (
            "Please look at this image and extract all the text content.\n"
            "Provide the output as plain text, maintaining the original layout and line breaks where appropriate.\n"
            "Include all visible text from the image."
        ),
        "json": (
            "Please look at this image and extract all the text content. Structure the output as JSON with these guidelines:\n"
            "- Identify different sections or components\n"
            "- Use appropriate keys for different text elements\n"
            "- Maintain the hierarchical structure of the content\n"
            "- Include all visible text from the image"
        ),
        "structured": (
            "Please look at this image and extract all the text content, focusing on structural elements:\n"
            "- Identify and format any tables\n"
            "- Extract lists and maintain their structure\n"
            "- Preserve any hierarchical relationships\n"
            "- Format sections and subsections clearly"
        ),
        "key_value": (
            "Please look at this image and extract text that appears in key-value pairs:\n"
            "- Look for labels and their associated values\n"
            "- Extract form fields and their contents\n"
            "- Identify any paired information\n"
            "- Present each pair on a new line as 'key: value'"
        ),
    }

    # Lower-case file suffixes picked up when a directory is scanned.
    _BATCH_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.pdf', '.tiff')

    def __init__(self, model_name: str = "llama3.2-vision:11b",
                 base_url: str = "http://localhost:11434/api/generate",
                 max_workers: int = 1,
                 timeout: Optional[float] = None):
        """
        Args:
            model_name: Model identifier sent in the request payload.
            base_url: URL of the generate endpoint that receives the POST.
            max_workers: Number of worker threads used by ``process_batch``.
            timeout: Seconds passed to ``requests.post`` as ``timeout``.
                ``None`` (the default) waits indefinitely, matching the
                behaviour before this parameter existed.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.max_workers = max_workers
        self.timeout = timeout

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete a temporary file, ignoring filesystem errors."""
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup: a file that is already gone or cannot be
            # deleted must not turn a successful OCR run into a failure.
            pass

    @staticmethod
    def _format_json_result(result: str) -> str:
        """Pretty-print ``result`` if it is valid JSON, else return it unchanged."""
        try:
            return json.dumps(json.loads(result), indent=2)
        except json.JSONDecodeError:
            return result

    def _encode_image(self, image_path: str) -> str:
        """Return the raw bytes of the file at ``image_path`` as a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def _preprocess_image(self, image_path: str) -> str:
        """
        Preprocess an image before OCR and return the path of the result.

        Steps:
        - Convert the first page of a PDF to a JPEG if needed
        - Convert to grayscale
        - Enhance contrast (CLAHE)
        - Reduce noise
        (Auto-rotation is not implemented yet.)

        The result is written next to the input as
        ``<input>_preprocessed.jpg``; the caller is responsible for deleting it.

        Raises:
            ValueError: If the PDF yields no pages, the image cannot be read,
                or the preprocessed image cannot be written.
        """
        source_path = image_path
        pdf_page_path = None

        # Handle PDF files
        if image_path.lower().endswith('.pdf'):
            # Only the first page is used, so only the first page is rendered.
            pages = convert_from_path(image_path, first_page=1, last_page=1)
            if not pages:
                raise ValueError("Could not convert PDF to image")
            # Save first page as temporary image
            pdf_page_path = f"{image_path}_temp.jpg"
            pages[0].save(pdf_page_path, 'JPEG')
            source_path = pdf_page_path

        # Read image; the intermediate PDF page is no longer needed once it
        # has been loaded into memory (or has failed to load).
        try:
            image = cv2.imread(source_path)
        finally:
            if pdf_page_path is not None:
                self._remove_quietly(pdf_page_path)
        if image is None:
            raise ValueError(f"Could not read image at {source_path}")

        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Enhance contrast using CLAHE
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # Denoise
        denoised = cv2.fastNlMeansDenoising(enhanced)

        # Auto-rotate if needed
        # TODO: Implement rotation detection and correction

        # Save preprocessed image; imwrite signals failure via its return value.
        preprocessed_path = f"{source_path}_preprocessed.jpg"
        if not cv2.imwrite(preprocessed_path, denoised):
            raise ValueError(f"Could not write preprocessed image to {preprocessed_path}")
        return preprocessed_path

    def _extract_text(self, image_path: str, format_type: str, preprocess: bool) -> str:
        """Run OCR on one file and return the model output; raises on any failure."""
        # Only a file created by preprocessing is ever deleted, never the
        # caller's own input, whatever its name looks like.
        temp_path = self._preprocess_image(image_path) if preprocess else None
        try:
            image_base64 = self._encode_image(temp_path if temp_path is not None else image_path)
        finally:
            if temp_path is not None:
                self._remove_quietly(temp_path)

        # Get the appropriate prompt
        prompt = self._PROMPTS.get(format_type, self._PROMPTS["text"])

        # Prepare the request payload
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "images": [image_base64]
        }

        # Make the API call to Ollama
        response = requests.post(self.base_url, json=payload, timeout=self.timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        result = response.json().get("response", "")

        # Re-format JSON output when the model returned valid JSON
        if format_type == "json":
            return self._format_json_result(result)
        return result

    def process_image(self, image_path: str, format_type: str = "markdown", preprocess: bool = True) -> str:
        """
        Process an image and extract text in the specified format

        PDF-to-image conversion happens only during preprocessing; with
        ``preprocess=False`` the file's raw bytes are sent as-is.

        Args:
            image_path: Path to the image file
            format_type: One of ["markdown", "text", "json", "structured", "key_value"];
                any other value uses the "text" prompt
            preprocess: Whether to apply image preprocessing

        Returns:
            The extracted text, or the string "Error processing image: <reason>"
            if anything fails. This method does not raise.
        """
        try:
            return self._extract_text(image_path, format_type, preprocess)
        except Exception as e:
            return f"Error processing image: {str(e)}"

    @classmethod
    def _collect_image_paths(cls, input_path: Union[str, "os.PathLike", List[str]],
                             recursive: bool) -> List[Path]:
        """Expand a directory, a single path, or a list of paths into a list of files."""
        if not isinstance(input_path, (str, os.PathLike)):
            return [Path(p) for p in input_path]
        base_path = Path(input_path)
        if not base_path.is_dir():
            return [base_path]
        candidates = base_path.rglob('*') if recursive else base_path.iterdir()
        # Suffix comparison is case-insensitive so "SCAN.PNG" is found too;
        # sorting gives a stable submission order.
        return sorted(
            p for p in candidates
            if p.is_file() and p.suffix.lower() in cls._BATCH_EXTENSIONS
        )

    def process_batch(
        self,
        input_path: Union[str, List[str]],
        format_type: str = "markdown",
        recursive: bool = False,
        preprocess: bool = True
    ) -> Dict[str, Any]:
        """
        Process multiple images in batch

        Args:
            input_path: Path to directory or list of image paths
            format_type: Output format type
            recursive: Whether to search directories recursively
            preprocess: Whether to apply image preprocessing

        Returns:
            Dictionary with "results" (path -> extracted text), "errors"
            (path -> error message for files that failed) and "statistics"
            (total / successful / failed counts)
        """
        # Collect all image paths
        image_paths = self._collect_image_paths(input_path, recursive)

        results: Dict[str, str] = {}
        errors: Dict[str, str] = {}

        # Process images in parallel with progress bar
        with tqdm(total=len(image_paths), desc="Processing images") as pbar:
            with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit the raising variant: process_image converts every
                # failure into a string, which would leave `errors` empty.
                future_to_path = {
                    executor.submit(self._extract_text, str(path), format_type, preprocess): path
                    for path in image_paths
                }
                for future in concurrent.futures.as_completed(future_to_path):
                    path = future_to_path[future]
                    try:
                        results[str(path)] = future.result()
                    except Exception as e:
                        errors[str(path)] = str(e)
                    pbar.update(1)

        return {
            "results": results,
            "errors": errors,
            "statistics": {
                "total": len(image_paths),
                "successful": len(results),
                "failed": len(errors)
            }
        }