Spaces:

RufusRubin777
/

OCRert

Sleeping

App Files Files Community

RufusRubin777 committed on Sep 28, 2024

Commit

1ec038e

verified ·

1 Parent(s): e8b31ed

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -20

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from pathlib import Path
 import re
 import easyocr
 tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
 model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
 model = model.eval().cpu()
@@ -28,6 +29,7 @@ def image_to_base64(image):
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
 # @spaces.GPU
 def run_GOT(image,language):
     unique_id = str(uuid.uuid4())
@@ -58,28 +60,26 @@ def run_GOT(image,language):
         if os.path.exists(image_path):
             os.remove(image_path)
-# Updated Search Functionality
-def search_keyword(text, keyword):
-    if not keyword.strip():
-        return '<h3 style="text-align: center;">Please enter a keyword to search.</h3>'
     text_lower = text.lower()
     keyword_lower = keyword.lower()
-    if keyword_lower not in text_lower:
-        return '<h3 style="text-align: center;">Keyword not found</h3>'
-    words = re.findall(r'\b\w+\b', text)
-    highlighted_text = []
-    for word in words:
-        if keyword_lower in word.lower():
-            highlighted_word = f'<mark>{word}</mark>'
-        else:
-            highlighted_word = word
-        highlighted_text.append(highlighted_word)
-    return '<h3>' + ' '.join(highlighted_text) + '</h3>'
 def cleanup_old_files():
     current_time = time.time()
@@ -93,6 +93,7 @@ title_html = """
 <p>Scan Master uses General OCR Theory (GOT), a 580M end-to-end OCR 2.0 model for English optical character recognition and EASYOCR for Hindi optical character recognition. It supports plain text ocr.</p>
 """
 with gr.Blocks() as scan_master_web_app:
     gr.HTML(title_html)
     gr.Markdown("""
@@ -141,4 +142,4 @@ with gr.Blocks() as scan_master_web_app:
 if __name__ == "__main__":
     cleanup_old_files()
-    scan_master_web_app.launch()

 import re
 import easyocr
+# OCR Model
 tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
 model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
 model = model.eval().cpu()
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
 # @spaces.GPU
 def run_GOT(image,language):
     unique_id = str(uuid.uuid4())
         if os.path.exists(image_path):
             os.remove(image_path)
# Search Functionality
def search_keyword(text, keyword):
    """Return an HTML snippet of ``text`` with each ``keyword`` hit highlighted.

    The search is case-insensitive and treats ``keyword`` as literal text,
    not as a regular expression. Every non-overlapping occurrence is wrapped
    in ``<mark>`` tags and the whole result is wrapped in ``<h3>``.

    Args:
        text: The text to search (e.g. the OCR output).
        keyword: The literal string to look for.

    Returns:
        An HTML string: the highlighted text when at least one occurrence
        exists, a centered "Keyword not found" heading when there is none,
        or a centered prompt when ``keyword`` is empty or only whitespace.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import html

    # An empty pattern matches at every position, which would scatter empty
    # <mark></mark> pairs through the output; ask for a real keyword instead.
    if not keyword or not keyword.strip():
        return '<h3 style="text-align: center;">Please enter a keyword to search.</h3>'

    # Match against the ORIGINAL text with IGNORECASE rather than comparing
    # lowercased copies: the offsets then always index into ``text`` itself,
    # and hits that differ from the keyword only by case are highlighted.
    # re.escape keeps characters such as "(" or "." from being read as regex.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

    pieces = []
    cursor = 0
    found = False
    for match in pattern.finditer(text):
        found = True
        # Escape the text so "<" or "&" in it cannot break or inject markup.
        pieces.append(html.escape(text[cursor:match.start()], quote=False))
        pieces.append('<mark>' + html.escape(match.group(0), quote=False) + '</mark>')
        cursor = match.end()

    if not found:
        return '<h3 style="text-align: center;">Keyword not found</h3>'

    # Whatever follows the last hit.
    pieces.append(html.escape(text[cursor:], quote=False))
    return '<h3>' + ''.join(pieces) + '</h3>'
 def cleanup_old_files():
     current_time = time.time()
 <p>Scan Master uses General OCR Theory (GOT), a 580M end-to-end OCR 2.0 model for English optical character recognition and EASYOCR for Hindi optical character recognition. It supports plain text ocr.</p>
 """
 with gr.Blocks() as scan_master_web_app:
     gr.HTML(title_html)
     gr.Markdown("""
 if __name__ == "__main__":
     cleanup_old_files()
+    scan_master_web_app.launch()