# Spaces: Sleeping (hosting-page status banner; not part of the program)
import torch | |
import transformers | |
from transformers import AutoTokenizer, AutoModel , AutoModelForCausalLM | |
from transformers import AutoModelForSeq2SeqLM | |
import pickle | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import nltk | |
from nltk.tokenize import word_tokenize | |
import re | |
import string | |
from nltk.corpus import stopwords | |
from tashaphyne.stemming import ArabicLightStemmer | |
import pyarabic.araby as araby | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import streamlit as st | |
# Tokenizer data needed by nltk's word_tokenize.
nltk.download('punkt')


@st.cache_resource
def _load_classifier():
    """Load the pickled TF-IDF vectorizer and SVM classifier from disk.

    Cached with ``st.cache_resource`` so the files are read once per server
    process instead of on every script rerun.

    Returns:
        A ``(vectorizer, classifier)`` tuple.
    """
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file; these artifacts must only ever come from a trusted source.
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        loaded_vectorizer = pickle.load(f)
    with open('svm_model.pkl', 'rb') as f:
        loaded_classifier = pickle.load(f)
    return loaded_vectorizer, loaded_classifier


@st.cache_resource
def _load_summarizer():
    """Load the AraBART seq2seq model and tokenizer and move the model to a device.

    Cached with ``st.cache_resource`` so the weights are loaded once per server
    process instead of on every script rerun.

    Returns:
        A ``(model, tokenizer, device)`` tuple; ``device`` is CUDA when
        available, otherwise CPU.
    """
    summarizer = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/AraBART")
    summarizer_tokenizer = AutoTokenizer.from_pretrained('bushra1dajam/AraBART')
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    summarizer.to(target_device)
    return summarizer, summarizer_tokenizer, target_device


# Module-level names used by the rest of the file.
vectorizer, model_classify = _load_classifier()
model, tokenizer, device = _load_summarizer()
def summarize_text(text):
    """Generate a summary of ``text`` with the module-level seq2seq model.

    The text is prefixed with "summarize: ", tokenized with truncation at 512
    tokens, moved to ``device`` and decoded with 8-beam search.

    Args:
        text: The text to summarize.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    prompt = "summarize: " + text
    encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    generation_options = dict(max_length=512, num_beams=8, early_stopping=True)
    generated = model.generate(on_device["input_ids"], **generation_options)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)
def remove_numbers(text):
    """Delete every run of digit characters from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with all digit runs removed; other characters are untouched.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)
# Runs of characters that are NOT in the allowed set. The allowed set is
# written entirely with escapes so it cannot be damaged by re-encoding the
# source file (the literal Arabic-Indic digit range had been mis-decoded into
# a range starting at a space character, which made the class allow Latin text).
_NON_ARABIC_RUN = re.compile(
    r'[^'
    r'0-9'            # ASCII digits
    r'\u0600-\u06ff'  # Arabic
    r'\u0750-\u077f'  # Arabic Supplement
    r'\ufb50-\ufbc1'  # Arabic Presentation Forms-A (part)
    r'\ufbd3-\ufd3f'
    r'\ufd50-\ufd8f'
    r'\ufe70-\ufefc'  # Arabic Presentation Forms-B
    r'\ufdf0-\ufdfd'
    r'.'              # literal full stop
    r'\u0660-\u0669'  # Arabic-Indic digits (explicit, also inside U+0600-U+06FF)
    r']+'
)


def Removing_non_arabic(text):
    """Replace each run of non-Arabic characters in ``text`` with one space.

    Arabic-script characters, ASCII and Arabic-Indic digits and the full stop
    are kept. Everything else, including whitespace, is collapsed run by run
    into a single space.

    Args:
        text: Input string.

    Returns:
        The filtered string.
    """
    return _NON_ARABIC_RUN.sub(' ', text)
# Stopword corpora read by the stopwords.words(...) calls in this module.
nltk.download('stopwords')

# Arabic and typographic punctuation followed by ASCII punctuation. The
# non-ASCII characters are written as escapes so the table cannot be corrupted
# by re-encoding the file: division and multiplication signs, Arabic semicolon
# (U+061B), tatweel (U+0640), Arabic comma (U+060C), Arabic question mark
# (U+061F), broken bar, curly double quotes, ellipsis and en dash.
ara_punctuations = (
    '`\u00f7\u00d7\u061b<>_()*&^%][\u0640\u060c/:"\u061f.,\'{}~\u00a6+|!'
    '\u201d\u2026\u201c\u2013\u0640'
) + string.punctuation

# Stopwords for every language shipped in the NLTK corpus (no language filter).
stop_words = stopwords.words()
def remove_punctuations(text):
    """Delete every character listed in ``ara_punctuations`` from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` without any of the punctuation characters.
    """
    # Mapping each code point to None makes str.translate drop that character.
    deletion_table = dict.fromkeys(map(ord, ara_punctuations))
    return text.translate(deletion_table)
# Letter normalizations, written as escapes so they cannot be corrupted by
# re-encoding the file (the literal forms had degraded into no-op patterns).
# No replacement output is itself a key, so one translate pass is equivalent
# to applying the substitutions one after another.
_LETTER_NORMALIZATION = str.maketrans({
    '\u0625': '\u0627',  # alef with hamza below -> alef
    '\u0623': '\u0627',  # alef with hamza above -> alef
    '\u0671': '\u0627',  # alef wasla            -> alef
    '\u0622': '\u0627',  # alef with madda       -> alef
    '\u0649': '\u064a',  # alef maksura          -> yeh
    '\u0624': '\u0621',  # waw with hamza        -> hamza
    '\u0626': '\u0621',  # yeh with hamza        -> hamza
    '\u0629': '\u0647',  # teh marbuta           -> heh
})

# Shadda, fatha, tanwin fath, damma, tanwin damm, kasra, tanwin kasr, sukun
# and tatweel (kashida).
_DIACRITICS_AND_TATWEEL = re.compile(
    '[\u0651\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0640]'
)

# Any character repeated two or more times in a row.
_REPEATED_CHAR = re.compile(r'(.)\1+')


def remove_tashkeel(text):
    """Normalize Arabic letters and strip diacritics from ``text``.

    Steps, in order: trim surrounding whitespace; unify alef variants, alef
    maksura, hamza carriers and teh marbuta; delete diacritics and tatweel;
    collapse every run of a repeated character to exactly two occurrences;
    finally pass the result through ``araby.strip_tashkeel``.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    text = text.strip().translate(_LETTER_NORMALIZATION)
    text = _DIACRITICS_AND_TATWEEL.sub('', text)
    text = _REPEATED_CHAR.sub(r'\1\1', text)
    return araby.strip_tashkeel(text)
arabic_stopwords = stopwords.words("arabic")
# Set copy for constant-time membership tests; the list above is kept under
# its original name for any other code that reads it.
_ARABIC_STOPWORD_SET = frozenset(arabic_stopwords)


def remove_stop_words(text):
    """Drop Arabic stopwords from ``text``.

    The input is converted with ``str`` and split on whitespace. Tokens found
    in the NLTK Arabic stopword list are discarded and the rest are re-joined
    with single spaces.

    Args:
        text: Input value (converted with ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept = [word for word in str(text).split() if word not in _ARABIC_STOPWORD_SET]
    return " ".join(kept)
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: Input string.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``.
    """
    return word_tokenize(text)
def Arabic_Light_Stemmer(text):
    """Light-stem every item of ``text`` and join the stems with single spaces.

    Args:
        text: Iterable of tokens (``preprocess_text`` passes the output of
            ``tokenize_text``).

    Returns:
        One string containing the stems separated by single spaces.
    """
    stemmer = ArabicLightStemmer()
    stems = []
    for token in text:
        stems.append(stemmer.light_stem(token))
    return " ".join(stems)
def preprocess_text(text):
    """Run the full cleaning pipeline used before classification.

    Order: remove digits, remove non-Arabic characters, remove punctuation,
    remove stopwords, normalize letters and diacritics, tokenize, light-stem.

    Args:
        text: Raw input string.

    Returns:
        The stemmed tokens joined into one space-separated string.
    """
    string_stages = (
        remove_numbers,
        Removing_non_arabic,
        remove_punctuations,
        remove_stop_words,
        remove_tashkeel,
    )
    for stage in string_stages:
        text = stage(text)
    tokens = tokenize_text(text)
    return Arabic_Light_Stemmer(tokens)
# Display label for each classifier output index (0, 1, 2), in index order.
# NOTE(review): the label text looks mis-encoded (UTF-8 read through another
# code page); confirm against the original file before shipping.
class_mapping = dict(enumerate((
    "ุฌูุงุฆูุฉ",
    "ุงุญูุงู ุดุฎุตูุฉ",
    "ุนุงู ุฉ",
)))
# Global page styling: right-to-left body, right-aligned inputs and headings,
# button colours, and the helper classes used by the page functions
# (logo-container, home-title, home-description, larger-text).
_PAGE_CSS = """
<style>
body {
background-color: #f0f4f8;
direction: rtl;
font-family: 'Arial', sans-serif;
}
.logo-container {
display: flex;
justify-content: center;
align-items: center;
margin-bottom: 20px;
}
.stTextArea textarea, .stText {
text-align: right;
}
.stButton>button {
background-color: #3498db;
color: white;
font-family: 'Arial', sans-serif;
}
.stButton>button:hover {
background-color: #2980b9;
}
h1, h2, h3, h4, h5, h6, .stSubheader {
text-align: right;
}
.home-title {
text-align: center;
font-size: 40px;
color: #3498db;
}
.home-description {
text-align: center;
font-size: 20px;
color: #2c3e50;
}
.larger-text {
font-size: 24px;
color: #2c3e50;
}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
# Home page
def home_page():
    """Render the landing page: a title and a description, both as raw HTML."""
    title_html = '<h1 class="home-title">ู ุฑุญุจุง ุจู ูู ุชุทุจูู ูุฌูุฒ</h1>'
    description_html = '<p class="home-description">ุชุทุจูู ูุฌูุฒ ููุฏู ูู ุฎุฏู ุฉ ุงูุชุตููู ูุงูู ูุฎุต ูููุตูุต ุงููุงููููุฉ. ูู ููู ุฅุฏุฎุงู ุงููุตูุต ููุง ููุญุตูู ุนูู ุชุตููู ุฏููู ูู ูุฎุต ุดุงู ู.</p>'
    for html_snippet in (title_html, description_html):
        st.markdown(html_snippet, unsafe_allow_html=True)
def main_page():
    """Render the classify-and-summarize page.

    Shows a title, a text area and a button. When the button is pressed with
    non-empty input, the text is preprocessed, vectorized and classified, the
    raw input is summarized, and both results are displayed.
    """
    st.title("ุตูู ููุฎุต")

    # Input text area
    input_text = st.text_area("ุงุฏุฎู ุงููุต", "")

    # Nothing to do until the button is pressed with some text entered.
    if not st.button('ุตูู ููุฎุต'):
        return
    if not input_text:
        return

    cleaned = preprocess_text(input_text)
    feature_matrix = vectorizer.transform([cleaned])
    predicted_index = model_classify.predict(feature_matrix)[0]
    predicted_label = class_mapping.get(predicted_index, "ูู ูุชู ุงูุชุนุฑู")

    # The summary is generated from the raw input, not the preprocessed text.
    summary = summarize_text(input_text)

    st.markdown('<p class="larger-text">ุชุตููู ุงููุถูุฉ :</p>', unsafe_allow_html=True)
    st.write(predicted_label)
    st.markdown('<p class="larger-text">ู ูุฎุต ูููุถูุฉ :</p>', unsafe_allow_html=True)
    st.write(summary)
def app():
    """Draw the sidebar (logo, header, page selector) and render the chosen page."""
    # Page label -> render function; the dict order is the selector order.
    pages = {
        "ุงูุฑุฆูุณูุฉ": home_page,
        " ุตูู ููุฎุต !": main_page,
    }

    # Sidebar navigation with the logo inside the sidebar.
    # NOTE(review): the source's indentation was lost; the header and selector
    # are assumed to belong inside the sidebar block - confirm.
    with st.sidebar:
        st.markdown('<div class="logo-container">', unsafe_allow_html=True)
        st.image("logo.png", width=200)  # logo.png must be present in the app folder
        st.markdown('</div>', unsafe_allow_html=True)
        st.header("ุชุทููู ูุฌูุฒ")
        page_selection = st.selectbox("ุงุฎุชุฑ ุตูุญุฉ", list(pages))

    render_page = pages.get(page_selection)
    if render_page is not None:
        render_page()
# Script entry point: build the UI when this file is executed directly.
if __name__ == "__main__":
    app()