RufusRubin777 committed on
Commit
1ec038e
·
verified ·
1 Parent(s): e8b31ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -20
app.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
11
  import re
12
  import easyocr
13
 
 
14
  tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
15
  model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
16
  model = model.eval().cpu()
@@ -28,6 +29,7 @@ def image_to_base64(image):
28
  image.save(buffered, format="PNG")
29
  return base64.b64encode(buffered.getvalue()).decode()
30
 
 
31
  # @spaces.GPU
32
  def run_GOT(image,language):
33
  unique_id = str(uuid.uuid4())
@@ -58,28 +60,26 @@ def run_GOT(image,language):
58
  if os.path.exists(image_path):
59
  os.remove(image_path)
60
 
61
- # Updated Search Functionality
62
- def search_keyword(text, keyword):
63
- if not keyword.strip():
64
- return '<h3 style="text-align: center;">Please enter a keyword to search.</h3>'
65
-
66
  text_lower = text.lower()
67
  keyword_lower = keyword.lower()
68
 
69
- if keyword_lower not in text_lower:
70
- return '<h3 style="text-align: center;">Keyword not found</h3>'
71
-
72
- words = re.findall(r'\b\w+\b', text)
73
- highlighted_text = []
74
-
75
- for word in words:
76
- if keyword_lower in word.lower():
77
- highlighted_word = f'<mark>{word}</mark>'
78
- else:
79
- highlighted_word = word
80
- highlighted_text.append(highlighted_word)
81
-
82
- return '<h3>' + ' '.join(highlighted_text) + '</h3>'
83
 
84
  def cleanup_old_files():
85
  current_time = time.time()
@@ -93,6 +93,7 @@ title_html = """
93
  <p>Scan Master uses General OCR Theory (GOT), a 580M end-to-end OCR 2.0 model for English optical character recognition and EASYOCR for Hindi optical character recognition. It supports plain text ocr.</p>
94
  """
95
 
 
96
  with gr.Blocks() as scan_master_web_app:
97
  gr.HTML(title_html)
98
  gr.Markdown("""
@@ -141,4 +142,4 @@ with gr.Blocks() as scan_master_web_app:
141
 
142
  if __name__ == "__main__":
143
  cleanup_old_files()
144
- scan_master_web_app.launch()
 
11
  import re
12
  import easyocr
13
 
14
+ # OCR Model
15
  tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
16
  model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
17
  model = model.eval().cpu()
 
29
  image.save(buffered, format="PNG")
30
  return base64.b64encode(buffered.getvalue()).decode()
31
 
32
+
33
  # @spaces.GPU
34
  def run_GOT(image,language):
35
  unique_id = str(uuid.uuid4())
 
60
  if os.path.exists(image_path):
61
  os.remove(image_path)
62
 
63
+ # Search Functionality
64
def search_keyword(text, keyword):
    """Return *text* as an HTML snippet with every occurrence of *keyword* highlighted.

    Matching is case-insensitive and literal: the keyword is never
    interpreted as a regular expression. Matched spans keep the casing they
    have in *text* and are wrapped in ``<mark>`` tags.

    Args:
        text: The text to search (e.g. the OCR output).
        keyword: The literal string to look for.

    Returns:
        ``<h3>...</h3>`` containing the HTML-escaped text with matches wrapped
        in ``<mark>``; the centered "Keyword not found" heading when there is
        no match; or the escaped text without highlights when *keyword* is
        empty.
    """
    # Local import keeps this function self-contained regardless of which
    # modules the rest of the file imports.
    import html

    def _esc(fragment):
        # Escape &, < and > so characters in the text cannot be parsed as markup.
        return html.escape(fragment, quote=False)

    # An empty pattern matches at every position, which would only scatter
    # empty <mark></mark> pairs through the output; show the text as-is.
    if not keyword:
        return '<h3>' + _esc(text) + '</h3>'

    # re.escape makes characters such as ".", "+" or "(" match literally;
    # IGNORECASE searches the original text directly so offsets and casing
    # of the highlighted spans are always those of `text`.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(_esc(text[cursor:match.start()]))
        pieces.append('<mark>' + _esc(match.group(0)) + '</mark>')
        cursor = match.end()

    if not pieces:
        return '<h3 style="text-align: center;">Keyword not found</h3>'

    # Trailing text after the last match.
    pieces.append(_esc(text[cursor:]))
    return '<h3>' + ''.join(pieces) + '</h3>'
83
 
84
  def cleanup_old_files():
85
  current_time = time.time()
 
93
  <p>Scan Master uses General OCR Theory (GOT), a 580M end-to-end OCR 2.0 model for English optical character recognition and EASYOCR for Hindi optical character recognition. It supports plain text ocr.</p>
94
  """
95
 
96
+
97
  with gr.Blocks() as scan_master_web_app:
98
  gr.HTML(title_html)
99
  gr.Markdown("""
 
142
 
143
  if __name__ == "__main__":
144
  cleanup_old_files()
145
+ scan_master_web_app.launch()