# OCR Hub: Streamlit front end that sends uploaded images to an OCRProcessor
# backed by an Ollama vision model and displays / offers the extracted text.

import json
import os
import tempfile

import streamlit as st
from PIL import Image

from ocr_processor import OCRProcessor

# Page configuration
st.set_page_config(
    page_title="OCR Hub",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for better UI
# NOTE(review): the stylesheet body is blank in this copy of the file; it was
# presumably a <style> block - restore it from version control if so.
st.markdown(""" """, unsafe_allow_html=True)

# Maximum number of preview columns in the upload gallery.
GALLERY_COLUMNS = 4

# Markdown shown on the "About" tab.
ABOUT_MARKDOWN = """
This application uses state-of-the-art vision language models through Ollama to extract text from images.

### Features:
- 🖼️ Support for multiple image formats
- 📦 Batch processing capability
- 🔄 Parallel processing
- 🔍 Image preprocessing and enhancement
- 📊 Multiple output formats
- 📥 Easy result download

### Models:
- **LLaVA 7B**: Efficient vision-language model for real-time processing
- **Llama 3.2 Vision**: Advanced model with high accuracy for complex documents
- **MiniCPM-V 2.6**: Process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344)
"""


def get_available_models():
    """Return the model names offered in the sidebar selector."""
    return ["llava:7b", "MiniCPM-V", "llama3.2-vision:11b"]


def process_single_image(processor, image_path, format_type, enable_preprocessing):
    """Process a single image and return the result.

    Calls ``processor.process_image`` and returns whatever it returns. Any
    exception is converted into an ``"Error processing image: ..."`` string so
    the UI can display it instead of crashing.
    """
    try:
        return processor.process_image(
            image_path=image_path,
            format_type=format_type,
            preprocess=enable_preprocessing,
        )
    except Exception as e:  # UI boundary: report the failure, never crash the page
        return f"Error processing image: {str(e)}"


def process_batch_images(processor, image_paths, format_type, enable_preprocessing):
    """Process multiple images and return results.

    Calls ``processor.process_batch`` and returns whatever it returns. Any
    exception is converted into ``{"error": "<message>"}``; callers must check
    for that key before reading the regular result fields.
    """
    try:
        return processor.process_batch(
            input_path=image_paths,
            format_type=format_type,
            preprocess=enable_preprocessing,
        )
    except Exception as e:  # UI boundary: report the failure, never crash the page
        return {"error": str(e)}


def _render_sidebar():
    """Draw the sidebar controls and return (model, format, workers, preprocess)."""
    with st.sidebar:
        st.header("🎮 Controls")
        selected_model = st.selectbox(
            "🤖 Select Vision Model",
            get_available_models(),
            index=0,
        )
        format_type = st.selectbox(
            "📄 Output Format",
            ["markdown", "text", "json", "structured", "key_value"],
            help="Choose how you want the extracted text to be formatted",
        )
        max_workers = st.slider(
            "🔄 Parallel Processing",
            min_value=1,
            max_value=8,
            value=2,
            help="Number of images to process in parallel (for batch processing)",
        )
        enable_preprocessing = st.checkbox(
            "🔍 Enable Preprocessing",
            value=True,
            help="Apply image enhancement and preprocessing",
        )
        st.markdown("---")

        # Model info box
        if selected_model == "llava:7b":
            st.info("LLaVA 7B: Efficient vision-language model optimized for real-time processing")
        elif selected_model == "MiniCPM-V":
            st.info("MiniCPM-V 2.6: A GPT-4V Level MLLM for Single Image, Multi Image and Video, outperforms GPT-4o mini, Gemini 1.5 Pro and Claude 3.5 Sonnet")
        else:
            st.info("Llama 3.2 Vision: Advanced model with high accuracy for complex text extraction")

    return selected_model, format_type, max_workers, enable_preprocessing


def _save_uploads(uploaded_files, temp_dir):
    """Write each uploaded file into ``temp_dir`` and return the saved paths."""
    image_paths = []
    for uploaded_file in uploaded_files:
        # Keep only the final path component so a client-supplied name can
        # never point outside the temporary directory.
        temp_path = os.path.join(temp_dir, os.path.basename(uploaded_file.name))
        with open(temp_path, "wb") as f:
            f.write(uploaded_file.getvalue())
        image_paths.append(temp_path)
    return image_paths


def _show_gallery(uploaded_files):
    """Show a preview of every upload, with a notice for files PIL cannot open."""
    st.subheader(f"📸 Input Images ({len(uploaded_files)} files)")
    cols = st.columns(min(len(uploaded_files), GALLERY_COLUMNS))
    for idx, uploaded_file in enumerate(uploaded_files):
        with cols[idx % len(cols)]:
            try:
                # Image.open is lazy, so decoding errors may only surface when
                # st.image reads the pixels; both calls stay inside the try.
                image = Image.open(uploaded_file)
                st.image(image, use_container_width=True, caption=uploaded_file.name)
            except OSError:
                # PDFs (accepted by the uploader) and corrupt images cannot be
                # previewed; PIL's UnidentifiedImageError subclasses OSError.
                st.info(f"{uploaded_file.name}: preview not available")


def _show_single_result(result, format_type):
    """Display one extraction result and offer it as a download."""
    st.subheader("📝 Extracted Text")
    st.markdown(result)
    # Download button for single result
    st.download_button(
        "📥 Download Result",
        result,
        file_name=f"ocr_result.{format_type}",
        mime="text/plain",
    )


def _show_batch_results(results):
    """Display statistics, per-file text, errors and a JSON download for a batch."""
    # process_batch_images signals total failure with {"error": ...}; that
    # payload has none of the keys read below.
    if "error" in results:
        st.error(f"Batch processing failed: {results['error']}")
        return

    # Display statistics
    stats = results['statistics']
    st.subheader("📊 Processing Statistics")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Images", stats['total'])
    with col2:
        st.metric("Successful", stats['successful'])
    with col3:
        st.metric("Failed", stats['failed'])

    # Display results
    st.subheader("📝 Extracted Text")
    for file_path, text in results['results'].items():
        with st.expander(f"Result: {os.path.basename(file_path)}"):
            st.markdown(text)

    # Display errors if any
    errors = results.get('errors')
    if errors:
        st.error("⚠️ Some files had errors:")
        for file_path, error in errors.items():
            st.warning(f"{os.path.basename(file_path)}: {error}")

    # Download all results as JSON. Rendered directly: wrapping it in another
    # st.button would rerun the script with the "Process Images" button reset,
    # so the download button would never be drawn.
    st.download_button(
        "📥 Download Results JSON",
        json.dumps(results, indent=2),
        file_name="ocr_results.json",
        mime="application/json",
    )


def _render_processing_tab(processor, format_type, enable_preprocessing):
    """Handle upload, preview and processing of images on the main tab."""
    # File upload area with multiple file support
    uploaded_files = st.file_uploader(
        "Drop your images here",
        type=['png', 'jpg', 'jpeg', 'tiff', 'bmp', 'pdf'],
        accept_multiple_files=True,
        help="Supported formats: PNG, JPG, JPEG, TIFF, BMP, PDF",
    )
    if not uploaded_files:
        return

    # The saved copies must outlive processing, so everything that touches
    # image_paths stays inside the temporary-directory context.
    with tempfile.TemporaryDirectory() as temp_dir:
        image_paths = _save_uploads(uploaded_files, temp_dir)
        _show_gallery(uploaded_files)

        # Process button
        if st.button("🚀 Process Images"):
            with st.spinner("Processing images..."):
                if len(image_paths) == 1:
                    # Single image processing
                    result = process_single_image(
                        processor, image_paths[0], format_type, enable_preprocessing
                    )
                    _show_single_result(result, format_type)
                else:
                    # Batch processing
                    results = process_batch_images(
                        processor, image_paths, format_type, enable_preprocessing
                    )
                    _show_batch_results(results)


def main():
    """Render the OCR Hub page: title, sidebar, processing tab and About tab."""
    st.title("🔍 OCR Hub")
    # NOTE(review): any HTML wrapper around this subtitle is missing in this
    # copy of the file; only the text remains.
    st.markdown("\n\nPowered by Ollama Vision Models\n\n", unsafe_allow_html=True)

    # Sidebar controls
    selected_model, format_type, max_workers, enable_preprocessing = _render_sidebar()

    # Initialize OCR Processor
    processor = OCRProcessor(model_name=selected_model, max_workers=max_workers)

    # Main content area with tabs
    tab1, tab2 = st.tabs(["📸 Image Processing", "ℹ️ About"])
    with tab1:
        _render_processing_tab(processor, format_type, enable_preprocessing)
    with tab2:
        st.header("About OCR Hub")
        st.markdown(ABOUT_MARKDOWN)


if __name__ == "__main__":
    main()