bushra1dajam committed: Update app.py

app.py CHANGED
The hunk header (`@@ -1,218 +1,218 @@`) marks the whole file as rewritten, but the only substantive change is on lines 30-31, where the previous revision had truncated, syntactically invalid `from_pretrained` calls; the new revision completes the repo id:

```diff
@@ -30,2 +30,2 @@
-model = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/
-tokenizer = AutoTokenizer.from_pretrained('bushra1dajam/
+model = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/Wajeez_model")
+tokenizer = AutoTokenizer.from_pretrained('bushra1dajam/Wajeez_model')
```

Every other line is identical between the two revisions.
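Since the bug being fixed was a truncated repo id, a quick sanity check that the corrected id actually resolves on the Hub can save a failed Space build. A minimal sketch using `huggingface_hub` (a dependency of `transformers`); the repo id is taken from the new revision:

```python
from huggingface_hub import list_repo_files

# Should print config.json, tokenizer files, and the model weights
# if "bushra1dajam/Wajeez_model" resolves on the Hub.
print(list_repo_files("bushra1dajam/Wajeez_model"))
```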
The new revision of `app.py`:

```python
import pickle
import re
import string

import nltk
import pyarabic.araby as araby
import streamlit as st
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer  # class of the pickled vectorizer
from tashaphyne.stemming import ArabicLightStemmer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

nltk.download('punkt')
nltk.download('stopwords')

# Classical pipeline for classification: pickled TF-IDF vectorizer + SVM.
with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

with open('svm_model.pkl', 'rb') as f:
    model_classify = pickle.load(f)

# Seq2seq model for summarization.
model = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/Wajeez_model")
tokenizer = AutoTokenizer.from_pretrained("bushra1dajam/Wajeez_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def summarize_text(text):
    # The "summarize: " prefix follows the T5-style task-prefix convention.
    inputs = tokenizer("summarize: " + text, return_tensors="pt",
                       max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=512,
        num_beams=8,
        # no_repeat_ngram_size=4,  # prevents larger n-gram repetitions
        early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def remove_numbers(text):
    return re.sub(r'\d+', '', text)


def Removing_non_arabic(text):
    # Replace any run of characters outside the Arabic Unicode blocks,
    # digits, and the dot with a single space.
    return re.sub(
        r'[^0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f'
        r'\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.٠-٩]+',
        ' ', text)


ara_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation
arabic_stopwords = stopwords.words("arabic")


def remove_punctuations(text):
    translator = str.maketrans('', '', ara_punctuations)
    return text.translate(translator)


def remove_tashkeel(text):
    # Normalize common letter variants, then strip diacritics and kashida.
    text = text.strip()
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    noise = re.compile(""" ّ    | # Tashdid
                           َ    | # Fatha
                           ً    | # Tanwin Fath
                           ُ    | # Damma
                           ٌ    | # Tanwin Damm
                           ِ    | # Kasra
                           ٍ    | # Tanwin Kasr
                           ْ    | # Sukun
                           ـ     # Tatwil/Kashida
                       """, re.VERBOSE)
    text = re.sub(noise, '', text)
    # Collapse any character repeated more than twice down to two.
    text = re.sub(r'(.)\1+', r"\1\1", text)
    return araby.strip_tashkeel(text)


def remove_stop_words(text):
    words = [w for w in str(text).split() if w not in arabic_stopwords]
    return " ".join(words)


def tokenize_text(text):
    return word_tokenize(text)


def Arabic_Light_Stemmer(tokens):
    stemmer = ArabicLightStemmer()
    return " ".join(stemmer.light_stem(token) for token in tokens)


def preprocess_text(text):
    text = remove_numbers(text)
    text = Removing_non_arabic(text)
    text = remove_punctuations(text)
    text = remove_stop_words(text)
    text = remove_tashkeel(text)
    tokens = tokenize_text(text)
    return Arabic_Light_Stemmer(tokens)


# SVM label ids and their Arabic names:
class_mapping = {
    0: "جنائية",       # criminal
    1: "احوال شخصية",  # personal status
    2: "عامة",         # general
}

st.markdown("""
<style>
body {
    background-color: #f0f4f8;
    direction: rtl;
    font-family: 'Arial', sans-serif;
}

.logo-container {
    display: flex;
    justify-content: center;
    align-items: center;
    margin-bottom: 20px;
}

.stTextArea textarea, .stText {
    text-align: right;
}

.stButton>button {
    background-color: #3498db;
    color: white;
    font-family: 'Arial', sans-serif;
}

.stButton>button:hover {
    background-color: #2980b9;
}

h1, h2, h3, h4, h5, h6, .stSubheader {
    text-align: right;
}

.home-title {
    text-align: center;
    font-size: 40px;
    color: #3498db;
}

.home-description {
    text-align: center;
    font-size: 20px;
    color: #2c3e50;
}

.larger-text {
    font-size: 24px;
    color: #2c3e50;
}
</style>
""", unsafe_allow_html=True)


def home_page():
    # "Welcome to the Wajeez app" plus a short description of the service.
    st.markdown('<h1 class="home-title">مرحبا بك في تطبيق وجيز</h1>', unsafe_allow_html=True)
    st.markdown('<p class="home-description">تطبيق وجيز يقدم لك خدمة التصنيف والملخص للنصوص القانونية. يمكنك إدخال النصوص هنا للحصول على تصنيف دقيق وملخص شامل.</p>', unsafe_allow_html=True)


def main_page():
    st.title("صنف ولخص")  # "Classify and summarize"

    input_text = st.text_area("ادخل النص", "")  # "Enter the text"

    if st.button('صنف ولخص'):
        if input_text:
            # Classify the preprocessed text with the TF-IDF + SVM pipeline.
            prepro = preprocess_text(input_text)
            features = vectorizer.transform([prepro])
            prediction = model_classify.predict(features)
            predicted_class = class_mapping.get(prediction[0], "لم يتم التعرف")  # "unrecognized"

            # Summarize the raw, unpreprocessed input.
            summarized_text = summarize_text(input_text)

            st.markdown('<p class="larger-text">تصنيف القضية :</p>', unsafe_allow_html=True)  # "Case classification:"
            st.write(predicted_class)

            st.markdown('<p class="larger-text">ملخص للقضية :</p>', unsafe_allow_html=True)  # "Case summary:"
            st.write(summarized_text)


def app():
    # Sidebar navigation with the logo inside the sidebar.
    with st.sidebar:
        st.markdown('<div class="logo-container">', unsafe_allow_html=True)
        st.image("logo.png", width=200)  # logo.png must sit in the app folder
        st.markdown('</div>', unsafe_allow_html=True)

        st.header("تطبيق وجيز")  # "Wajeez app"
        page_selection = st.selectbox("اختر صفحة", ["الرئيسية", " صنف ولخص !"])  # "Choose a page"

    if page_selection == "الرئيسية":  # "Home"
        home_page()
    elif page_selection == " صنف ولخص !":
        main_page()


if __name__ == "__main__":
    app()
```
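As written, every Streamlit rerun (each button click or widget change) re-executes the whole script, so the pickles and the seq2seq model are reloaded from scratch. Below is a sketch of how the loading section could be wrapped with `st.cache_resource` (available in Streamlit 1.18+) so the heavy objects are created once per process; `load_models` is a name introduced here for illustration:

```python
import pickle

import streamlit as st
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


@st.cache_resource
def load_models():
    # Hypothetical helper: runs once per server process; later reruns of the
    # script reuse the cached objects instead of reloading them.
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    with open('svm_model.pkl', 'rb') as f:
        model_classify = pickle.load(f)
    model = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/Wajeez_model")
    tokenizer = AutoTokenizer.from_pretrained("bushra1dajam/Wajeez_model")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return vectorizer, model_classify, model, tokenizer, device


vectorizer, model_classify, model, tokenizer, device = load_models()
```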