VinayHajare committed on
Commit
34bb902
•
1 Parent(s): 8a3322f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +250 -0
app.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from ocr_processor import OCRProcessor
3
+ import tempfile
4
+ import os
5
+ from PIL import Image
6
+ import json
7
+
8
# Page configuration.
# NOTE: st.set_page_config is kept ahead of every other Streamlit call in
# this file, matching the original statement order.
st.set_page_config(
    page_title="OCR Hub",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for better UI. The stylesheet is injected as raw HTML, which is
# why unsafe_allow_html=True is required.
st.markdown("""
<style>
.stApp {
    max-width: 100%;
    padding: 1rem;
}
.main {
    background-color: #f8f9fa;
}
.stButton button {
    width: 100%;
    border-radius: 5px;
    height: 3em;
    background-color: #4CAF50;
    color: white;
}
.stSelectbox {
    margin-bottom: 1rem;
}
.upload-text {
    text-align: center;
    padding: 2rem;
    border: 2px dashed #ccc;
    border-radius: 10px;
    background-color: #ffffff;
}
.stImage {
    border-radius: 10px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.gallery {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
    gap: 1rem;
    padding: 1rem;
}
.gallery-item {
    border: 1px solid #ddd;
    border-radius: 8px;
    padding: 0.5rem;
    background: white;
}
</style>
""", unsafe_allow_html=True)
61
+
62
def get_available_models() -> list[str]:
    """Return the model names offered in the sidebar model selector.

    The first entry is the one the selector shows by default.
    """
    return ["llava:7b", "MiniCPM-V", "llama3.2-vision:11b"]
64
+
65
def process_single_image(processor, image_path, format_type, enable_preprocessing):
    """Process a single image and return the result.

    Args:
        processor: Object exposing ``process_image(image_path=...,
            format_type=..., preprocess=...)``.
        image_path: Path of the image handed to the processor.
        format_type: Output format name forwarded unchanged.
        enable_preprocessing: Forwarded as the ``preprocess`` keyword.

    Returns:
        Whatever ``processor.process_image`` returns. If it raises, the
        string ``"Error processing image: <message>"`` is returned instead
        of propagating the exception.
    """
    try:
        return processor.process_image(
            image_path=image_path,
            format_type=format_type,
            preprocess=enable_preprocessing,
        )
    except Exception as e:
        # Deliberate best-effort boundary: the UI shows this text in place
        # of the OCR output rather than crashing the page.
        return f"Error processing image: {e}"
76
+
77
def process_batch_images(processor, image_paths, format_type, enable_preprocessing):
    """Process multiple images and return results.

    Args:
        processor: Object exposing ``process_batch(input_path=...,
            format_type=..., preprocess=...)``.
        image_paths: Paths forwarded as the ``input_path`` keyword.
        format_type: Output format name forwarded unchanged.
        enable_preprocessing: Forwarded as the ``preprocess`` keyword.

    Returns:
        Whatever ``processor.process_batch`` returns. If it raises, a dict
        is returned that still carries the ``"error"`` message but also the
        ``"results"``, ``"errors"`` and ``"statistics"`` keys the caller in
        this file indexes, with every input counted as failed.
    """
    try:
        return processor.process_batch(
            input_path=image_paths,
            format_type=format_type,
            preprocess=enable_preprocessing,
        )
    except Exception as e:
        message = str(e)
        if image_paths is None:
            paths = []
        elif isinstance(image_paths, str):
            paths = [image_paths]
        else:
            paths = list(image_paths)
        # Keep the same shape as a successful batch so the UI code that
        # reads ['statistics'], ['results'] and ['errors'] does not raise
        # KeyError on a failed batch.
        return {
            "error": message,
            "results": {},
            "errors": {path: message for path in paths},
            "statistics": {
                "total": len(paths),
                "successful": 0,
                "failed": len(paths),
            },
        }
88
+
89
def _render_sidebar():
    """Draw the sidebar widgets and return (model, format, workers, preprocess)."""
    with st.sidebar:
        st.header("🎮 Controls")

        selected_model = st.selectbox(
            "🤖 Select Vision Model",
            get_available_models(),
            index=0,
        )

        format_type = st.selectbox(
            "📄 Output Format",
            ["markdown", "text", "json", "structured", "key_value"],
            help="Choose how you want the extracted text to be formatted",
        )

        max_workers = st.slider(
            "🔄 Parallel Processing",
            min_value=1,
            max_value=8,
            value=2,
            help="Number of images to process in parallel (for batch processing)",
        )

        enable_preprocessing = st.checkbox(
            "🔍 Enable Preprocessing",
            value=True,
            help="Apply image enhancement and preprocessing",
        )

        st.markdown("---")

        # Model info box
        if selected_model == "llava:7b":
            st.info("LLaVA 7B: Efficient vision-language model optimized for real-time processing")
        elif selected_model == "MiniCPM-V":
            st.info("MiniCPM-V 2.6: A GPT-4V Level MLLM for Single Image, Multi Image and Video, outperforms GPT-4o mini, Gemini 1.5 Pro and Claude 3.5 Sonnet")
        else:
            st.info("Llama 3.2 Vision: Advanced model with high accuracy for complex text extraction")

    return selected_model, format_type, max_workers, enable_preprocessing


def _save_uploads(uploaded_files, temp_dir):
    """Write each upload into temp_dir and return the list of saved paths."""
    image_paths = []
    for idx, uploaded_file in enumerate(uploaded_files):
        # The name comes from the client; basename() keeps a crafted name
        # (e.g. containing "../") from being written outside temp_dir.
        safe_name = os.path.basename(uploaded_file.name) or f"upload_{idx}"
        temp_path = os.path.join(temp_dir, safe_name)
        with open(temp_path, "wb") as f:
            f.write(uploaded_file.getvalue())
        image_paths.append(temp_path)
    return image_paths


def _render_gallery(uploaded_files):
    """Show the uploads in a grid of at most four columns."""
    st.subheader(f"📸 Input Images ({len(uploaded_files)} files)")
    cols = st.columns(min(len(uploaded_files), 4))
    for idx, uploaded_file in enumerate(uploaded_files):
        with cols[idx % len(cols)]:
            # The uploader accepts PDFs, which PIL cannot open; show a
            # caption instead of crashing the whole page on preview.
            if uploaded_file.name.lower().endswith(".pdf"):
                st.caption(f"{uploaded_file.name} (no preview available for PDF)")
                continue
            try:
                image = Image.open(uploaded_file)
            except OSError:
                st.caption(f"{uploaded_file.name} (preview unavailable)")
                continue
            st.image(image, use_container_width=True, caption=uploaded_file.name)


def _render_batch_results(results):
    """Show statistics, per-file text, errors and a JSON download for a batch."""
    # process_batch_images signals a failed batch with an "error" key; report
    # it instead of indexing keys that may be missing.
    if "error" in results:
        st.error(f"Batch processing failed: {results['error']}")
        return

    # Display statistics
    st.subheader("📊 Processing Statistics")
    stats = results['statistics']
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Images", stats['total'])
    with col2:
        st.metric("Successful", stats['successful'])
    with col3:
        st.metric("Failed", stats['failed'])

    # Display results
    st.subheader("📝 Extracted Text")
    for file_path, text in results['results'].items():
        with st.expander(f"Result: {os.path.basename(file_path)}"):
            st.markdown(text)

    # Display errors if any
    if results['errors']:
        st.error("⚠️ Some files had errors:")
        for file_path, error in results['errors'].items():
            st.warning(f"{os.path.basename(file_path)}: {error}")

    # Download all results as JSON. The download button is rendered directly:
    # wrapping it in a second st.button never worked, because clicking the
    # inner button reruns the script with the outer "Process Images" button
    # no longer pressed, so this branch was not reached again.
    st.download_button(
        "📥 Download Results JSON",
        json.dumps(results, indent=2),
        file_name="ocr_results.json",
        mime="application/json",
    )


def _render_about():
    """Render the static "About" tab."""
    st.header("About OCR Hub")
    st.markdown("""
    This application uses state-of-the-art vision language models through Ollama to extract text from images.

    ### Features:
    - 🖼️ Support for multiple image formats
    - 📦 Batch processing capability
    - 🔄 Parallel processing
    - 🔍 Image preprocessing and enhancement
    - 📊 Multiple output formats
    - 📥 Easy result download

    ### Models:
    - **LLaVA 7B**: Efficient vision-language model for real-time processing
    - **Llama 3.2 Vision**: Advanced model with high accuracy for complex documents
    - **MiniCPM-V 2.6**: Process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344)
    """)


def main():
    """Build the OCR Hub page: sidebar settings, upload/processing tab, about tab.

    Uploaded files are written to a temporary directory that exists only for
    the duration of the current script run; all processing happens inside
    that ``with`` block so the paths are still valid when the processor
    reads them.
    """
    st.title("🔍 OCR Hub")
    st.markdown("<p style='text-align: center; color: #666;'>Powered by Ollama Vision Models</p>", unsafe_allow_html=True)

    # Sidebar controls
    selected_model, format_type, max_workers, enable_preprocessing = _render_sidebar()

    # Initialize OCR Processor
    processor = OCRProcessor(model_name=selected_model, max_workers=max_workers)

    # Main content area with tabs
    tab1, tab2 = st.tabs(["📸 Image Processing", "ℹ️ About"])

    with tab1:
        # File upload area with multiple file support
        uploaded_files = st.file_uploader(
            "Drop your images here",
            type=['png', 'jpg', 'jpeg', 'tiff', 'bmp', 'pdf'],
            accept_multiple_files=True,
            help="Supported formats: PNG, JPG, JPEG, TIFF, BMP, PDF",
        )

        if uploaded_files:
            # Create a temporary directory for uploaded files
            with tempfile.TemporaryDirectory() as temp_dir:
                image_paths = _save_uploads(uploaded_files, temp_dir)

                # Display images in a gallery
                _render_gallery(uploaded_files)

                # Process button
                if st.button("🚀 Process Images"):
                    with st.spinner("Processing images..."):
                        if len(image_paths) == 1:
                            # Single image processing
                            result = process_single_image(
                                processor,
                                image_paths[0],
                                format_type,
                                enable_preprocessing,
                            )
                            st.subheader("📝 Extracted Text")
                            st.markdown(result)

                            # Download button for single result
                            st.download_button(
                                "📥 Download Result",
                                result,
                                file_name=f"ocr_result.{format_type}",
                                mime="text/plain",
                            )
                        else:
                            # Batch processing
                            results = process_batch_images(
                                processor,
                                image_paths,
                                format_type,
                                enable_preprocessing,
                            )
                            _render_batch_results(results)

    with tab2:
        _render_about()
248
+
249
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()