Luke committed on
Commit e651999 · 1 Parent(s): 7336a40

no message

IdentifyModel/cardModel.py ADDED
@@ -0,0 +1,26 @@
+
+def parse_id_card(text, validation_type, entities=None):
+    if validation_type == "身分證正面":
+        result = {
+            "解析全文內容": text,
+            "姓名": entities.get('B-PER', '無法解析') if entities else '無法解析',
+            "出生年月日": entities.get('B-DATE', '無法解析') if entities else '無法解析',
+            "發證日期": entities.get('I-DATE', '無法解析') if entities else '無法解析',
+            "統一編號": entities.get('B-NUM', '無法解析') if entities else '無法解析'
+        }
+    elif validation_type == "身分證反面":
+        result = {
+            "解析全文內容": text,
+            "父": entities.get('B-FATHER', '無法解析') if entities else '無法解析',
+            "母": entities.get('B-MOTHER', '無法解析') if entities else '無法解析',
+            "配偶": entities.get('B-SPOUSE', '無法解析') if entities else '無法解析',
+            "出生地": entities.get('B-LOC', '無法解析') if entities else '無法解析',
+            "住址": entities.get('I-LOC', '無法解析') if entities else '無法解析',
+            "編號": entities.get('B-ID', '無法解析') if entities else '無法解析'
+        }
+    else:
+        result = {
+            "解析全文內容": text,
+        }
+
+    return result
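A minimal usage sketch of parse_id_card (not part of the commit); the entities dict mirrors the one Plan/AiLLM.py builds from NER output, and the sample values are hypothetical:

    from IdentifyModel.cardModel import parse_id_card

    # Hypothetical NER output, keyed by entity tag as in Plan/AiLLM.py
    entities = {'B-PER': '王小明', 'B-DATE': '民國80年1月1日', 'B-NUM': 'A123456789'}
    result = parse_id_card('...raw OCR text...', '身分證正面', entities)
    print(result['姓名'])      # 王小明
    print(result['發證日期'])  # 無法解析 — no 'I-DATE' entity was found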
Plan/AiLLM.py ADDED
@@ -0,0 +1,16 @@
+import os
+import pytesseract
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+from IdentifyModel.cardModel import parse_id_card
+
+# Initialize the Taiwanese BERT NER model
+tokenizer = AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese")
+model = AutoModelForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
+ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
+
+
+def llm_recognition(image, validation_type, language):
+    text = pytesseract.image_to_string(image, lang=language)
+    ner_results = ner_pipeline(text)
+    entities = {result['entity']: text[result['start']:result['end']] for result in ner_results}
+    return parse_id_card(text, validation_type, entities)
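One caveat in the dict comprehension above: entity tags become dict keys, so when the pipeline emits the same tag twice, the later span silently overwrites the earlier one. A standalone sketch with mocked pipeline output (no model download needed):

    # Mocked token-level NER results, shaped like the pipeline's output
    text = '王小明 李大華'
    ner_results = [
        {'entity': 'B-PER', 'start': 0, 'end': 3},
        {'entity': 'B-PER', 'start': 4, 'end': 7},
    ]
    entities = {r['entity']: text[r['start']:r['end']] for r in ner_results}
    print(entities)  # {'B-PER': '李大華'} — only the last span per tag survives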
Plan/pytesseractOCR.py ADDED
@@ -0,0 +1,27 @@
+# import cv2
+import os
+import pytesseract
+
+from IdentifyModel.cardModel import parse_id_card
+from Preprocess.preprocessImg import preprocess_image001
+
+
+def ocr_recognition(image, validation_type, language):
+    try:
+        preprocessed_image = preprocess_image001(image)
+        custom_config = r'--oem 3 --psm 6'
+        text = pytesseract.image_to_string(preprocessed_image, lang=language, config=custom_config)
+        return parse_id_card(text, validation_type)
+    except Exception as e:
+        return str(e)
+
+# def ocr_recognition_2(image: str, lang: str = 'chi_tra') -> str:
+#     try:
+#         img = cv2.imread(image)
+#         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+#         threshold_img = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)[1]
+#         result = pytesseract.image_to_string(threshold_img, lang=lang)
+#         os.remove(image)
+#         return result
+#     except Exception as e:
+#         return str(e)
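A usage sketch for ocr_recognition (assumes Tesseract and the chi_tra traineddata are installed; 'id_front.png' is a hypothetical sample). Note that this path never passes entities to parse_id_card, so every entity field comes back as 無法解析:

    from PIL import Image
    from Plan.pytesseractOCR import ocr_recognition

    image = Image.open('id_front.png')  # hypothetical sample scan
    result = ocr_recognition(image, '身分證正面', 'chi_tra')
    print(result)  # {'解析全文內容': '...', '姓名': '無法解析', ...}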
Preprocess/preprocessImg.py ADDED
@@ -0,0 +1,19 @@
+import cv2
+import numpy as np
+from PIL import Image, ImageEnhance
+
+
+# Approach 1
+def preprocess_image001(image):
+    # Convert the PIL image to a NumPy array
+    image = np.array(image)
+    # Convert to grayscale (arrays from PIL are RGB, not BGR)
+    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    # Boost contrast
+    enhancer = ImageEnhance.Contrast(Image.fromarray(gray))
+    enhanced_image = enhancer.enhance(2)
+    # Binarize with Otsu's threshold
+    _, binary = cv2.threshold(np.array(enhanced_image), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    # Denoise
+    denoised = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)
+    return Image.fromarray(denoised)
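A quick way to exercise the pipeline above without a real scan; the synthetic image is just noise, so only the mode and size of the output are meaningful:

    import numpy as np
    from PIL import Image
    from Preprocess.preprocessImg import preprocess_image001

    # Synthetic 3-channel image standing in for an uploaded photo
    rng = np.random.default_rng(0)
    fake = rng.normal(180, 20, (100, 200, 3)).clip(0, 255).astype(np.uint8)
    out = preprocess_image001(Image.fromarray(fake))
    print(out.mode, out.size)  # L (200, 100): single-channel after Otsu binarization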
app.py CHANGED
@@ -1,13 +1,16 @@
-from PIL import Image
-import pytesseract
-import gradio as gr
 import os
+import pytesseract
+
+import gradio as gr
+
+from Plan.AiLLM import llm_recognition
+from Plan.pytesseractOCR import ocr_recognition
+from Preprocess.preprocessImg import preprocess_image001
+
 langs = []
 
 choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
 
-blocks = gr.Blocks()
-
 
 # If you don't have tesseract executable in your PATH, include the following:
 # pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
@@ -29,24 +32,51 @@ blocks = gr.Blocks()
 # print(pytesseract.image_to_osd(Image.open('test.png'))
 
 
-def run(image, lang=None):
-    result = pytesseract.image_to_string(
-        image, lang=None if lang == [] else lang)
-    return result
+# Get the full list of installed Tesseract languages
+languages = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
+
+
+print(' ======================================================== ')
+print(' ###### choices:' + str(choices))
+print(' ###### GET ENV - TESSDATA_PREFIX:' + str(os.getenv('TESSDATA_PREFIX')))
+print(' ###### OS - TESSDATA_PREFIX:' + str(os.environ.get('TESSDATA_PREFIX')))
+# os.environ['TESSDATA_PREFIX'] = os.getenv('TESSDATA_PREFIX')
+print(' ###### Tesseract_Cmd:' + pytesseract.pytesseract.tesseract_cmd)
+# pytesseract.pytesseract.tesseract_cmd = os.getenv('TESSDATA_PREFIX')
+print(' ======================================================== ')
+
+
+def preprocess_and_ocr(image, validation_type, language):
+    # ocr_recognition preprocesses internally, so pass it the raw image
+    preprocessed_image = preprocess_image001(image)
+    ocr_result = ocr_recognition(image, validation_type, language)
+    return preprocessed_image, ocr_result
+
+
+def preprocess_and_llm(image, validation_type, language):
+    preprocessed_image = preprocess_image001(image)
+    llm_result = llm_recognition(preprocessed_image, validation_type, language)
+    return preprocessed_image, llm_result
 
 
 with gr.Blocks() as demo:
-    gr.Markdown("## Hello pytesseract!")
     with gr.Row():
-        with gr.Column():
-            image_in = gr.Image(type="pil")
-            lang = gr.Dropdown(choices)
-            btn = gr.Button("Run")
-        with gr.Column():
-            text_out = gr.TextArea()
-
-    examples = gr.Examples([["./eurotext.png", None]], fn=run, inputs=[
-        image_in, lang], outputs=[text_out], cache_examples=False)
-    btn.click(fn=run, inputs=[image_in, lang], outputs=[text_out])
-
-demo.launch()
+        image_input = gr.Image(type="pil", label="上傳圖片")
+        validation_type = gr.Dropdown(choices=["身分證正面", "身分證反面"], label="驗證類別")
+        language_dropdown = gr.Dropdown(choices=languages, value="chi_tra", label="語言")
+
+    with gr.Row():
+        ocr_button = gr.Button("使用 OCR")
+        llm_button = gr.Button("使用 AI LLM")
+
+    with gr.Row():
+        preprocess_output = gr.Image(label="OCR 預處理圖片")
+    with gr.Row():
+        ocr_output = gr.JSON(label="OCR 解析結果")
+        llm_output = gr.JSON(label="AI LLM 解析結果")
+
+    # Each handler returns two values, so both output components must be listed
+    ocr_button.click(preprocess_and_ocr, inputs=[image_input, validation_type, language_dropdown], outputs=[preprocess_output, ocr_output])
+    llm_button.click(preprocess_and_llm, inputs=[image_input, validation_type, language_dropdown], outputs=[preprocess_output, llm_output])
+
+demo.launch(share=False)
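For reference, a minimal standalone sketch of the two-output click pattern used above (hypothetical component names, runnable on its own):

    import gradio as gr

    def handler(text):
        # Two return values map positionally onto the two components in `outputs`
        return text.upper(), len(text)

    with gr.Blocks() as mini:
        box = gr.Textbox(label="input")
        btn = gr.Button("run")
        upper_out = gr.Textbox(label="uppercased")
        length_out = gr.Number(label="length")
        btn.click(handler, inputs=[box], outputs=[upper_out, length_out])

    # mini.launch()  # uncomment to serve locally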
eurotext.png DELETED
Binary file (31.5 kB)
 
requirements.txt CHANGED
@@ -1,2 +1,7 @@
 gradio
-pytesseract
+pytesseract
+transformers
+Pillow
+torch
+huggingface-hub
+opencv-python