Spaces:

hHoai
/

Vietnamese_correction

Sleeping

App Files Files Community

hHoai committed 22 days ago

Commit

52f1f7f

verified ·

1 Parent(s): 66031eb

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -16

app.py CHANGED Viewed

@@ -33,6 +33,10 @@ def load_model_and_tokenizer():
 correct_spelling, tokenizer = load_model_and_tokenizer()
 # Nhập liệu từ người dùng
 context = st.text_area("Input text", placeholder="Nhập văn bản có lỗi chính tả...")
@@ -48,31 +52,28 @@ if st.button("Get Result"):
             original_tokens = tokenizer.tokenize(context)
             corrected_tokens = tokenizer.tokenize(corrected_text)
             # So sánh các từ và tìm từ thay đổi
             def highlight_differences(original, corrected):
                 highlighted_text = []
-                modified_indices = []
-                matcher = difflib.SequenceMatcher(None, original, corrected)
                 for tag, i1, i2, j1, j2 in matcher.get_opcodes():
-                    if tag == 'replace':  # Nếu từ bị thay thế
-                        for word in corrected[j1:j2]:
-                            highlighted_text.append(f"<span class='highlight'>{word}</span>")  # Bôi đỏ từ đã sửa
-                            modified_indices.extend(range(j1, j2))
-                    elif tag == 'insert':  # Nếu từ mới được thêm
-                        for word in corrected[j1:j2]:
-                            highlighted_text.append(f"<span class='highlight'>{word}</span>")
-                            modified_indices.extend(range(j1, j2))
                     else:  # Nếu từ không thay đổi
-                        highlighted_text.extend(corrected[j1:j2])
-                return " ".join(highlighted_text), modified_indices
-            # Lấy kết quả đã chỉnh sửa và vị trí các từ đã sửa
-            highlighted_text, modified_indices = highlight_differences(original_tokens, corrected_tokens)
             # Hiển thị kết quả
             st.markdown(f"### Corrected Text (with highlighted words):\n\n{highlighted_text}", unsafe_allow_html=True)
-            st.markdown(f"### Modified Word Indices:\n\n{modified_indices}")
         except Exception as e:
             st.error(f"An error occurred: {e}")
     else:

 correct_spelling, tokenizer = load_model_and_tokenizer()
+# Join tokenizer pieces into one display string: a piece containing "▁" has every "▁" removed, any other piece gets a "_" prefix (NOTE(review): "▁" is presumably the tokenizer's word-start marker — confirm)
+def join_tokens_as_phrases(tokens):  # tokens: iterable of str pieces; returns them joined by single spaces with outer whitespace stripped
+    return " ".join([token.replace("▁", "") if "▁" in token else f"_{token}" for token in tokens]).strip()
 # Nhập liệu từ người dùng
 context = st.text_area("Input text", placeholder="Nhập văn bản có lỗi chính tả...")
             original_tokens = tokenizer.tokenize(context)
             corrected_tokens = tokenizer.tokenize(corrected_text)
+            # Nối các từ thành cụm từ với gạch dưới
+            original_phrases = join_tokens_as_phrases(original_tokens)
+            corrected_phrases = join_tokens_as_phrases(corrected_tokens)
             # So sánh các từ và tìm từ thay đổi
             def highlight_differences(original, corrected):
                 """Return the corrected text with changed words wrapped in highlight spans.

                 Both arguments are whitespace-separated strings. They are split into
                 words and aligned with difflib; every run of corrected words that was
                 replaced or inserted relative to ``original`` is wrapped in
                 ``<span class='highlight'>...</span>``. Unchanged runs are emitted as-is.

                 The result is meant to be rendered as raw HTML, so every word is
                 HTML-escaped first; only the span tags added here are live markup.

                 Args:
                     original: The text before correction.
                     corrected: The text after correction.

                 Returns:
                     A single string of space-joined segments (empty if ``corrected``
                     has no words).
                 """
                 # Local import keeps this block self-contained; html is stdlib.
                 import html
                 # Split once up front instead of re-splitting for every opcode.
                 original_words = original.split()
                 corrected_words = corrected.split()
                 matcher = difflib.SequenceMatcher(None, original_words, corrected_words)
                 highlighted_text = []
                 for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
                     # 'delete' opcodes cover no corrected words (j1 == j2); emitting an
                     # empty segment for them would leave doubled spaces in the output.
                     if j1 == j2:
                         continue
                     # Escape user/model text so it cannot inject markup when rendered
                     # with unsafe_allow_html=True.
                     segment = html.escape(" ".join(corrected_words[j1:j2]), quote=False)
                     if tag in ("replace", "insert"):  # word(s) replaced or newly added
                         segment = f"<span class='highlight'>{segment}</span>"
                     highlighted_text.append(segment)
                 return " ".join(highlighted_text)
             # Hiển thị kết quả
+            highlighted_text = highlight_differences(original_phrases, corrected_phrases)
+            st.markdown(f"### Original Text (phrases):\n\n{original_phrases}")
             st.markdown(f"### Corrected Text (with highlighted words):\n\n{highlighted_text}", unsafe_allow_html=True)
         except Exception as e:
             st.error(f"An error occurred: {e}")
     else: