Spaces:

bushra1dajam
/

Wajeez

Sleeping

App Files Files Community

Wajeez / app.py

bushra1dajam

Update app.py

aa6b518 verified about 2 months ago

raw

history blame contribute delete

6.72 kB

	import torch
	import transformers
	from transformers import AutoTokenizer, AutoModel , AutoModelForCausalLM
	from transformers import AutoModelForSeq2SeqLM, GenerationConfig, AutoConfig ,BartForConditionalGeneration
	import pickle
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	import nltk
	from nltk.tokenize import word_tokenize
	import re
	import string
	from nltk.corpus import stopwords
	from tashaphyne.stemming import ArabicLightStemmer
	import pyarabic.araby as araby
	from sklearn.feature_extraction.text import TfidfVectorizer
	import streamlit as st
	# Tokenizer data used by nltk.tokenize.word_tokenize.
	# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
	# 'punkt' — confirm against the NLTK version pinned for this app.
	nltk.download('punkt')


	@st.cache_resource
	def _load_classifier():
	    """Load the pickled TF-IDF vectorizer and SVM classifier once per process.

	    Streamlit re-executes this script on every widget interaction; caching
	    keeps the unpickled objects alive between reruns instead of reading the
	    files again each time.

	    Returns:
	        A ``(vectorizer, classifier)`` tuple.
	    """
	    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
	    # Only ship pickles produced by a trusted training run.
	    with open('tfidf_vectorizer.pkl', 'rb') as f:
	        tfidf = pickle.load(f)

	    with open('svm_model.pkl', 'rb') as f:
	        classifier = pickle.load(f)

	    return tfidf, classifier


	@st.cache_resource
	def _load_summarizer():
	    """Load the summarization model and tokenizer once and place the model.

	    Returns:
	        A ``(model, tokenizer, device)`` tuple where ``device`` is CUDA when
	        available and CPU otherwise, and the model has been moved to it.
	    """
	    seq2seq = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/Wajeez_model")
	    tok = AutoTokenizer.from_pretrained('bushra1dajam/Wajeez_model',use_fast=False)

	    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    seq2seq.to(target)
	    return seq2seq, tok, target


	# Module-level names are unchanged so the rest of the file keeps working.
	vectorizer, model_classify = _load_classifier()
	model, tokenizer, device = _load_summarizer()

	def summarize_text(text):
	    """Summarize ``text`` with the module-level seq2seq model.

	    The input is prefixed with ``"summarize: "``, tokenized with truncation
	    at 512 tokens, and decoded from the first sequence returned by an
	    8-beam search capped at 512 tokens.

	    Args:
	        text: The string to summarize.

	    Returns:
	        The decoded summary with special tokens removed.
	    """
	    prompt = "summarize: " + text
	    encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
	    # Move every tensor of the encoding to the same device as the model.
	    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

	    generated = model.generate(
	        on_device["input_ids"],
	        max_length=512,
	        num_beams=8,
	        early_stopping=True,
	    )
	    best_sequence = generated[0]
	    return tokenizer.decode(best_sequence, skip_special_tokens=True)

	def remove_numbers(text):
	    """Return ``text`` with every run of digit characters deleted."""
	    digit_run = re.compile(r'\d+')
	    return digit_run.sub('', text)

	def Removing_non_arabic(text):
	    """Replace each run of disallowed characters in ``text`` with one space.

	    Characters that survive are those listed in the negated class below:
	    ASCII digits, the listed Arabic code-point ranges, Arabic-Indic digits
	    and the literal dot.
	    """
	    disallowed_run = r'[^0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.0-9٠-٩]+'
	    return re.sub(disallowed_run, ' ', text)

	# Stop-word lists used by nltk.corpus.stopwords below.
	nltk.download('stopwords')
	# Characters stripped by remove_punctuations: Arabic-specific marks plus the
	# ASCII punctuation set. The backslash before '|' is written as an explicit
	# '\\' so the literal no longer relies on an invalid escape sequence; the
	# resulting string value is unchanged.
	ara_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+\\|!”…“–ـ''' + string.punctuation
	# Stop words for every language NLTK ships (no language argument given).
	stop_words = stopwords.words()

	def remove_punctuations(text):
	    """Return ``text`` with every character of ``ara_punctuations`` deleted."""
	    # Third argument of str.maketrans lists the characters mapped to None.
	    return text.translate(str.maketrans('', '', ara_punctuations))


	# Diacritics and elongation mark deleted by remove_tashkeel. A character
	# class is used so each mark matches on its own; the earlier pattern escaped
	# the '|' separators, which made them literal pipes instead of alternation
	# and prevented the pattern from matching ordinary text.
	_ARABIC_NOISE = re.compile(
	    "["
	    "\u0651"  # Tashdid
	    "\u064E"  # Fatha
	    "\u064B"  # Tanwin Fath
	    "\u064F"  # Damma
	    "\u064C"  # Tanwin Damm
	    "\u0650"  # Kasra
	    "\u064D"  # Tanwin Kasr
	    "\u0652"  # Sukun
	    "\u0640"  # Tatwil/Kashida
	    "]"
	)


	def remove_tashkeel(text):
	    """Normalize Arabic letter variants and strip diacritics from ``text``.

	    Steps, in order: trim surrounding whitespace; fold alef variants to bare
	    alef; map alef maqsura to ya, hamza-on-waw / hamza-on-ya to hamza, and
	    ta marbuta to ha; delete the marks in ``_ARABIC_NOISE``; shorten any run
	    of three or more identical characters to two; finally pass the result
	    through ``araby.strip_tashkeel``.

	    Args:
	        text: The string to normalize.

	    Returns:
	        The normalized string.
	    """
	    text = text.strip()
	    text = re.sub("[إأٱآا]", "ا", text)
	    text = re.sub("ى", "ي", text)
	    text = re.sub("ؤ", "ء", text)
	    text = re.sub("ئ", "ء", text)
	    text = re.sub("ة", "ه", text)
	    # Marks must go before the repeat-collapse below, otherwise they sit
	    # between identical letters and hide the repetition.
	    text = _ARABIC_NOISE.sub('', text)
	    text = re.sub(r'(.)\1+', r"\1\1", text)
	    return araby.strip_tashkeel(text)

	arabic_stopwords = stopwords.words("arabic")
	# Frozen copy used only for membership tests: constant-time lookups instead
	# of scanning the list once per token. The list above keeps its name.
	_ARABIC_STOPWORD_SET = frozenset(arabic_stopwords)


	def remove_stop_words(text):
	    """Drop Arabic stop words from ``text``.

	    Args:
	        text: Input value; it is converted with ``str()`` and split on
	            whitespace.

	    Returns:
	        The remaining words joined by single spaces.
	    """
	    kept = [word for word in str(text).split() if word not in _ARABIC_STOPWORD_SET]
	    return " ".join(kept)

	def tokenize_text(text):
	    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
	    return word_tokenize(text)

	def Arabic_Light_Stemmer(text):
	    """Light-stem each item of ``text`` and join the stems with spaces.

	    Args:
	        text: An iterable of words; every item is passed to
	            ``ArabicLightStemmer.light_stem``.

	    Returns:
	        A single string of the stems separated by one space.
	    """
	    stemmer = ArabicLightStemmer()
	    stems = map(stemmer.light_stem, text)
	    return " ".join(stems)

	def preprocess_text(text):
	    """Run ``text`` through the full cleaning pipeline.

	    Each step receives the previous step's output. The last two steps turn
	    the string into tokens and then back into a space-joined string of
	    stems, so the return value is a string.
	    """
	    pipeline = (
	        remove_numbers,
	        Removing_non_arabic,
	        remove_punctuations,
	        remove_stop_words,
	        remove_tashkeel,
	        tokenize_text,
	        Arabic_Light_Stemmer,
	    )
	    for step in pipeline:
	        text = step(text)
	    return text

	# Display label for each classifier output; a label's position in the tuple
	# is the numeric class id it belongs to (0, 1, 2).
	class_mapping = dict(enumerate((
	    "جنائية",
	    "احوال شخصية",
	    "عامة",
	)))
	# Inject the app-wide stylesheet as raw HTML (hence unsafe_allow_html=True).
	# It sets the page to right-to-left, right-aligns text areas and headings,
	# colors the buttons, and defines the .logo-container, .home-title,
	# .home-description and .larger-text classes used by markup emitted
	# elsewhere in this file.
	st.markdown("""
	<style>
	body {
	background-color: #f0f4f8;
	direction: rtl;
	font-family: 'Arial', sans-serif;
	}

	.logo-container {
	display: flex;
	justify-content: center;
	align-items: center;
	margin-bottom: 20px;
	}

	.stTextArea textarea, .stText {
	text-align: right;
	}

	.stButton>button {
	background-color: #3498db;
	color: white;
	font-family: 'Arial', sans-serif;
	}

	.stButton>button:hover {
	background-color: #2980b9;
	}

	h1, h2, h3, h4, h5, h6, .stSubheader {
	text-align: right;
	}

	.home-title {
	text-align: center;
	font-size: 40px;
	color: #3498db;
	}

	.home-description {
	text-align: center;
	font-size: 20px;
	color: #2c3e50;
	}

	.larger-text {
	font-size: 24px;
	color: #2c3e50;
	}
	</style>
	""", unsafe_allow_html=True)


	# Landing page shown when the home entry is selected
	def home_page():
	    """Render the welcome title and the short description of the app."""
	    title_html = '<h1 class="home-title">مرحبا بك في تطبيق وجيز</h1>'
	    description_html = '<p class="home-description">تطبيق وجيز يقدم لك خدمة التصنيف والملخص للنصوص القانونية. يمكنك إدخال النصوص هنا للحصول على تصنيف دقيق وملخص شامل.</p>'

	    # Both snippets carry CSS classes, so they must be rendered as raw HTML.
	    for snippet in (title_html, description_html):
	        st.markdown(snippet, unsafe_allow_html=True)


	def main_page():
	    """Render the classify-and-summarize page.

	    Shows a text area and a button. When the button is pressed with
	    non-blank text, the text is preprocessed, vectorized and classified,
	    the raw text is summarized, and both results are displayed. When the
	    text is empty or whitespace-only, a warning is shown instead of
	    silently doing nothing.
	    """
	    st.title("صنف ولخص")

	    # Input text area
	    input_text = st.text_area("ادخل النص", "")

	    if not st.button('صنف ولخص'):
	        return

	    # Guard: blank input has nothing to classify or summarize.
	    if not input_text.strip():
	        st.warning("الرجاء إدخال نص أولاً")
	        return

	    # The classifier sees the cleaned text; the summarizer sees the raw text.
	    features = vectorizer.transform([preprocess_text(input_text)])
	    predicted_label = model_classify.predict(features)[0]
	    category = class_mapping.get(predicted_label, "لم يتم التعرف")
	    summarized_text = summarize_text(input_text)

	    st.markdown('<p class="larger-text">تصنيف القضية :</p>', unsafe_allow_html=True)
	    st.write(category)

	    st.markdown('<p class="larger-text">ملخص للقضية :</p>', unsafe_allow_html=True)
	    st.write(summarized_text)

	def app():
	    """Draw the sidebar (logo, app name, page selector) and the chosen page."""
	    # Page labels are defined once so the selector options and the dispatch
	    # below cannot drift apart. The leading space in the second label is
	    # kept exactly as it was.
	    home_label = "الرئيسية"
	    classify_label = " صنف ولخص !"

	    # Sidebar navigation with logo inside the sidebar
	    with st.sidebar:
	        # NOTE(review): each st.markdown call is rendered as its own element,
	        # so this opening/closing div pair probably does not wrap the image —
	        # confirm in the browser.
	        st.markdown('<div class="logo-container">', unsafe_allow_html=True)
	        st.image("logo.png", width=200) # Make sure you have the logo file in your app folder
	        st.markdown('</div>', unsafe_allow_html=True)

	        # Spelling corrected to match the app name used on the home page.
	        st.header("تطبيق وجيز")
	        page_selection = st.selectbox("اختر صفحة", [home_label, classify_label])

	    if page_selection == home_label:
	        home_page()
	    elif page_selection == classify_label:
	        main_page()


	if __name__ == "__main__":
	    app()