# Wajeez / app.py
import pickle
import re
import string

import torch
import nltk
import streamlit as st
import pyarabic.araby as araby
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tashaphyne.stemming import ArabicLightStemmer

# NLTK data needed by word_tokenize and the Arabic stopword list.
nltk.download('punkt')
nltk.download('stopwords')
# Load the fitted TF-IDF vectorizer and the trained SVM classifier.
with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
with open('svm_model.pkl', 'rb') as f:
    model_classify = pickle.load(f)

# AraBART seq2seq model used for abstractive summarization.
model = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/AraBART")
tokenizer = AutoTokenizer.from_pretrained('bushra1dajam/AraBART')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
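
# The loads above rerun on every Streamlit interaction. A minimal caching
# sketch, assuming Streamlit >= 1.18 (which provides st.cache_resource);
# load_summarizer is an illustrative helper and is not wired in below:
@st.cache_resource
def load_summarizer():
    tok = AutoTokenizer.from_pretrained("bushra1dajam/AraBART")
    mdl = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/AraBART")
    mdl.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    return tok, mdl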
def summarize_text(text):
    # AraBART is BART-based, so the T5-style "summarize: " prefix is not
    # required by the model; it is kept to preserve the app's behaviour.
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=512,
        num_beams=8,
        # no_repeat_ngram_size=4,  # optional: prevents repeated 4-grams
        early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
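
# Illustrative call (the input string is a made-up placeholder):
#   summary = summarize_text("ู†ุต ุงู„ุญูƒู… ุงู„ูƒุงู…ู„ ...")
# num_beams=8 favors quality over speed; a smaller value (e.g. 4) is a
# reasonable trade-off on CPU-only hardware.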
def remove_numbers(text):
    # Strip Western digits (0-9).
    cleaned_text = re.sub(r'\d+', '', text)
    return cleaned_text

def Removing_non_arabic(text):
    # Keep Arabic letters (including presentation forms), digits, and '.';
    # replace runs of anything else with a single space.
    text = re.sub(r'[^0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.ู -ูฉ]+', ' ', text)
    return text
# Arabic punctuation marks plus ASCII punctuation, stripped before vectorizing.
ara_punctuations = '''`รทร—ุ›<>_()*&^%][ู€ุŒ/:"ุŸ.,'{}~ยฆ+|!โ€โ€ฆโ€œโ€“ู€''' + string.punctuation

def remove_punctuations(text):
    translator = str.maketrans('', '', ara_punctuations)
    text = text.translate(translator)
    return text
def remove_tashkeel(text):
    # Normalize common letter variants, strip diacritics, and collapse
    # characters repeated more than twice.
    text = text.strip()
    text = re.sub("[ุฅุฃูฑุขุง]", "ุง", text)  # unify alef forms
    text = re.sub("ู‰", "ูŠ", text)        # alef maqsura -> ya
    text = re.sub("ุค", "ุก", text)        # waw-hamza -> hamza
    text = re.sub("ุฆ", "ุก", text)        # ya-hamza -> hamza
    text = re.sub("ุฉ", "ู‡", text)        # ta marbuta -> ha
    noise = re.compile(""" ู‘ | # Tashdid
                           ูŽ | # Fatha
                           ู‹ | # Tanwin Fath
                           ู | # Damma
                           ูŒ | # Tanwin Damm
                           ู | # Kasra
                           ู | # Tanwin Kasr
                           ู’ | # Sukun
                           ู€   # Tatwil/Kashida
                       """, re.VERBOSE)
    text = re.sub(noise, '', text)
    text = re.sub(r'(.)\1+', r"\1\1", text)  # e.g. a letter repeated 4x -> 2x
    return araby.strip_tashkeel(text)
arabic_stopwords = stopwords.words("arabic")

def remove_stop_words(text):
    tokens = [w for w in str(text).split() if w not in arabic_stopwords]
    return " ".join(tokens)
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

def Arabic_Light_Stemmer(text):
    # Expects a list of tokens; returns a space-joined string of light stems.
    Arabic_Stemmer = ArabicLightStemmer()
    text = [Arabic_Stemmer.light_stem(y) for y in text]
    return " ".join(text)
def preprocess_text(text):
    # The steps (and their order) must match the pipeline the TF-IDF
    # vectorizer and SVM were trained with.
    text = remove_numbers(text)
    text = Removing_non_arabic(text)
    text = remove_punctuations(text)
    text = remove_stop_words(text)
    text = remove_tashkeel(text)
    text = tokenize_text(text)
    text = Arabic_Light_Stemmer(text)
    return text
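
# Illustrative end-to-end use of the pipeline with the classifier (the input
# string is a made-up placeholder; exact stems depend on tashaphyne):
#   features = vectorizer.transform([preprocess_text("ู†ุต ุญูƒู… ู‚ุถุงุฆูŠ ...")])
#   label = model_classify.predict(features)[0]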
# SVM output label -> Arabic class name shown in the UI.
class_mapping = {
    0: "ุฌู†ุงุฆูŠุฉ",        # criminal
    1: "ุงุญูˆุงู„ ุดุฎุตูŠุฉ",   # personal status
    2: "ุนุงู…ุฉ"           # general
}
st.markdown("""
<style>
body {
background-color: #f0f4f8;
direction: rtl;
font-family: 'Arial', sans-serif;
}
.logo-container {
display: flex;
justify-content: center;
align-items: center;
margin-bottom: 20px;
}
.stTextArea textarea, .stText {
text-align: right;
}
.stButton>button {
background-color: #3498db;
color: white;
font-family: 'Arial', sans-serif;
}
.stButton>button:hover {
background-color: #2980b9;
}
h1, h2, h3, h4, h5, h6, .stSubheader {
text-align: right;
}
.home-title {
text-align: center;
font-size: 40px;
color: #3498db;
}
.home-description {
text-align: center;
font-size: 20px;
color: #2c3e50;
}
.larger-text {
font-size: 24px;
color: #2c3e50;
}
</style>
""", unsafe_allow_html=True)
# Home page: welcome heading ("Welcome to the Wajeez app") and a description
# ("Wajeez offers classification and summarization of legal texts ...").
def home_page():
    st.markdown('<h1 class="home-title">ู…ุฑุญุจุง ุจูƒ ููŠ ุชุทุจูŠู‚ ูˆุฌูŠุฒ</h1>', unsafe_allow_html=True)
    st.markdown('<p class="home-description">ุชุทุจูŠู‚ ูˆุฌูŠุฒ ูŠู‚ุฏู… ู„ูƒ ุฎุฏู…ุฉ ุงู„ุชุตู†ูŠู ูˆุงู„ุชู„ุฎูŠุต ู„ู„ู†ุตูˆุต ุงู„ู‚ุงู†ูˆู†ูŠุฉ. ูŠู…ูƒู†ูƒ ุฅุฏุฎุงู„ ุงู„ู†ุตูˆุต ู‡ู†ุง ู„ู„ุญุตูˆู„ ุนู„ู‰ ุชุตู†ูŠู ุฏู‚ูŠู‚ ูˆู…ู„ุฎุต ุดุงู…ู„.</p>', unsafe_allow_html=True)
# Main page: classify the input text and generate a summary
# (Arabic title: "Classify and Summarize").
def main_page():
    st.title("ุตู†ู ูˆู„ุฎุต")
    # Input text area ("Enter the text").
    input_text = st.text_area("ุงุฏุฎู„ ุงู„ู†ุต", "")
    if st.button('ุตู†ู ูˆู„ุฎุต'):
        if input_text:
            prepro = preprocess_text(input_text)
            features = vectorizer.transform([prepro])
            prediction = model_classify.predict(features)
            predicted_label = prediction[0]
            class_name = class_mapping.get(predicted_label, "ู„ู… ูŠุชู… ุงู„ุชุนุฑู")  # fallback: "not recognized"
            # Generate the summarized text.
            summarized_text = summarize_text(input_text)
            st.markdown('<p class="larger-text">ุชุตู†ูŠู ุงู„ู‚ุถูŠุฉ :</p>', unsafe_allow_html=True)  # "Case classification:"
            st.write(class_name)
            st.markdown('<p class="larger-text">ู…ู„ุฎุต ู„ู„ู‚ุถูŠุฉ :</p>', unsafe_allow_html=True)  # "Case summary:"
            st.write(summarized_text)
def app():
    # Sidebar navigation with the logo inside the sidebar.
    with st.sidebar:
        st.markdown('<div class="logo-container">', unsafe_allow_html=True)
        st.image("logo.png", width=200)  # logo.png must ship alongside the app
        st.markdown('</div>', unsafe_allow_html=True)
        st.header("ุชุทุจูŠู‚ ูˆุฌูŠุฒ")  # "Wajeez app"
        # "Choose a page": "Home" / "Classify and Summarize!"
        page_selection = st.selectbox("ุงุฎุชุฑ ุตูุญุฉ", ["ุงู„ุฑุฆูŠุณูŠุฉ", " ุตู†ู ูˆู„ุฎุต !"])
    if page_selection == "ุงู„ุฑุฆูŠุณูŠุฉ":
        home_page()
    elif page_selection == " ุตู†ู ูˆู„ุฎุต !":
        main_page()
if __name__ == "__main__":
    app()