Spaces:

hHoai
/

Vietnamese_correction

Sleeping

App Files Files Community

hHoai committed 22 days ago

Commit

ba980c3

verified ·

1 Parent(s): 52f1f7f

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -75

app.py CHANGED Viewed

@@ -1,80 +1,55 @@
 import streamlit as st
-from transformers import pipeline, AutoTokenizer
-import difflib
-# Cấu hình ứng dụng
 MAX_LENGTH = 512
-st.set_page_config(
-    page_title="Demo Correct Spelling Mistakes",
-    layout="centered",
-    initial_sidebar_state="auto"
-)
-# CSS tuỳ chỉnh cho phần highlight
-custom_css = """
-    <style>
-        .highlight {
-            color: red;
-            font-weight: bold;
-        }
-    </style>
-"""
-st.markdown(custom_css, unsafe_allow_html=True)
-st.title("Correct Spelling Mistakes App")
-# Load mô hình và tokenizer
-@st.cache_resource
-def load_model_and_tokenizer():
-    model_checkpoint = "hHoai/model_vietnamcorrection"  # Thay đổi checkpoint phù hợp
-    correct_spelling = pipeline("text2text-generation", model=model_checkpoint, tokenizer=model_checkpoint)
-    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-    return correct_spelling, tokenizer
-correct_spelling, tokenizer = load_model_and_tokenizer()
-# Hàm nối từ thành cụm từ
-def join_tokens_as_phrases(tokens):
-    return " ".join([token.replace("▁", "") if "▁" in token else f"_{token}" for token in tokens]).strip()
-# Nhập liệu từ người dùng
-context = st.text_area("Input text", placeholder="Nhập văn bản có lỗi chính tả...")
-# Xử lý nút bấm
-if st.button("Get Result"):
-    if context.strip():
-        try:
-            # Sử dụng pipeline để sửa lỗi chính tả
-            result = correct_spelling(context, max_length=MAX_LENGTH)
-            corrected_text = result[0]['generated_text'] if result else "No output generated."
-            # Tokenize sử dụng tokenizer của bạn
-            original_tokens = tokenizer.tokenize(context)
-            corrected_tokens = tokenizer.tokenize(corrected_text)
-            # Nối các từ thành cụm từ với gạch dưới
-            original_phrases = join_tokens_as_phrases(original_tokens)
-            corrected_phrases = join_tokens_as_phrases(corrected_tokens)
-            # So sánh các từ và tìm từ thay đổi
-            def highlight_differences(original, corrected):
-                highlighted_text = []
-                matcher = difflib.SequenceMatcher(None, original.split(), corrected.split())
-                for tag, i1, i2, j1, j2 in matcher.get_opcodes():
-                    if tag == 'replace' or tag == 'insert':  # Nếu từ bị thay thế hoặc thêm
-                        highlighted_text.append(
-                            f"<span class='highlight'>{' '.join(corrected.split()[j1:j2])}</span>"
-                        )
-                    else:  # Nếu từ không thay đổi
-                        highlighted_text.append(" ".join(corrected.split()[j1:j2]))
-                return " ".join(highlighted_text)
-            # Hiển thị kết quả
-            highlighted_text = highlight_differences(original_phrases, corrected_phrases)
-            st.markdown(f"### Original Text (phrases):\n\n{original_phrases}")
-            st.markdown(f"### Corrected Text (with highlighted words):\n\n{highlighted_text}", unsafe_allow_html=True)
-        except Exception as e:
-            st.error(f"An error occurred: {e}")
     else:
-        st.warning("Please input some text to process!")

 import streamlit as st
+from transformers import pipeline
 MAX_LENGTH = 512
+# Load the spell corrector model.
+# Streamlit re-executes this script on every widget interaction, so the
+# pipeline is built inside a cached loader instead of at bare module level;
+# otherwise it would be constructed again on each rerun.
+@st.cache_resource
+def load_corrector():
+    """Build and return the text2text-generation pipeline for VietAI/vit5-base-corrector."""
+    return pipeline("text2text-generation", model="VietAI/vit5-base-corrector")
+corrector = load_corrector()
+def find_mistake_positions(original, corrected):
+    """
+    Find the character positions of the words in ``original`` that were changed.
+    Words are obtained with ``str.split()`` and paired positionally; when the two
+    sentences have a different number of words, the unpaired trailing words of
+    the longer one are ignored.
+    original: the source sentence (string)
+    corrected: the corrected sentence (string)
+    Returns a list of (start_index, end_index) tuples, 1-based and inclusive,
+    one per word of ``original`` that differs from its counterpart.
+    """
+    positions = []
+    cursor = 0  # Character offset just past the previously located word
+    for orig_word, corr_word in zip(original.split(), corrected.split()):
+        # Locate the word from the end of the previous one. Advancing from the
+        # real match (rather than assuming a single space between words) keeps
+        # the offsets correct when words are separated by several spaces, tabs
+        # or newlines.
+        offset = original.find(orig_word, cursor)
+        cursor = offset + len(orig_word)
+        if orig_word != corr_word:
+            # Convert the 0-based offset to a 1-based inclusive (start, end) pair.
+            positions.append((offset + 1, offset + len(orig_word)))
+    return positions
+# Streamlit app layout
+st.title("Ứng Dụng Sửa Lỗi Chính Tả")
+st.write("Nhập văn bản và nhận kết quả sửa lỗi chính tả!")
+# Input text box for user to enter a sentence
+input_text = st.text_area("Nhập câu gốc:", "Hôm nay toi di hoc rất vui, gặp gỡ nhiêu ban be mơi.")
+if st.button("Sửa lỗi"):
+    # Do not send blank or whitespace-only input to the model: warn the user
+    # and end this script run instead.
+    if not input_text.strip():
+        st.warning("Vui lòng nhập văn bản cần sửa lỗi!")
+        st.stop()
+    # Perform batch prediction (a single-item batch holding the user's text)
+    predictions = corrector([input_text], max_length=MAX_LENGTH)
+    # Get the corrected text and find mistake positions
+    corrected_text = predictions[0]["generated_text"]
+    mistake_positions = find_mistake_positions(input_text, corrected_text)
+    # Display the results
+    st.subheader("Kết quả sửa lỗi:")
+    st.write(f"Câu gốc: {input_text}")
+    st.write(f"Câu sửa: {corrected_text}")
+    st.subheader("Vị trí các từ sai (start_index, end_index):")
+    if mistake_positions:
+        for start, end in mistake_positions:
+            st.write(f"Vị trí từ sai: {start} - {end}")
     else:
+        st.write("Không phát hiện từ sai.")