import torch import transformers from transformers import AutoTokenizer, AutoModel , AutoModelForCausalLM from transformers import AutoModelForSeq2SeqLM, GenerationConfig, AutoConfig ,BartForConditionalGeneration import pickle import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import nltk from nltk.tokenize import word_tokenize import re import string from nltk.corpus import stopwords from tashaphyne.stemming import ArabicLightStemmer import pyarabic.araby as araby from sklearn.feature_extraction.text import TfidfVectorizer import streamlit as st nltk.download('punkt') with open('tfidf_vectorizer.pkl', 'rb') as f: vectorizer = pickle.load(f) with open('svm_model.pkl', 'rb') as f: model_classify = pickle.load(f) model = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/Wajeez_model") tokenizer = AutoTokenizer.from_pretrained('bushra1dajam/Wajeez_model',use_fast=False) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) def summarize_text(text): inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True) inputs = {k: v.to(device) for k, v in inputs.items()} summary_ids = model.generate( inputs["input_ids"], max_length=512, num_beams=8, #no_repeat_ngram_size=4, # Prevents larger n-gram repetitions early_stopping=True) summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True) return summary def remove_numbers(text): cleaned_text = re.sub(r'\d+', '', text) return cleaned_text def Removing_non_arabic(text): text =re.sub(r'[^0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.0-9٠-٩]+', ' ',text) return text nltk.download('stopwords') ara_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation stop_words = stopwords.words() def remove_punctuations(text): translator = str.maketrans('', '', ara_punctuations) text = text.translate(translator) return text def remove_tashkeel(text): text = text.strip() text = re.sub("[إأٱآا]", "ا", text) text = re.sub("ى", "ي", text) text = re.sub("ؤ", "ء", text) text = re.sub("ئ", "ء", text) text = re.sub("ة", "ه", text) noise = re.compile(""" ّ | # Tashdid َ | # Fatha ً | # Tanwin Fath ُ | # Damma ٌ | # Tanwin Damm ِ | # Kasra ٍ | # Tanwin Kasr ْ | # Sukun ـ # Tatwil/Kashida """, re.VERBOSE) text = re.sub(noise, '', text) text = re.sub(r'(.)\1+', r"\1\1", text) return araby.strip_tashkeel(text) arabic_stopwords = stopwords.words("arabic") def remove_stop_words(text): Text=[i for i in str(text).split() if i not in arabic_stopwords] return " ".join(Text) def tokenize_text(text): tokens = word_tokenize(text) return tokens def Arabic_Light_Stemmer(text): Arabic_Stemmer = ArabicLightStemmer() text=[Arabic_Stemmer.light_stem(y) for y in text] return " " .join(text) def preprocess_text(text): text = remove_numbers(text) text = Removing_non_arabic(text) text = remove_punctuations(text) text = remove_stop_words(text) text = remove_tashkeel(text) text = tokenize_text(text) text = Arabic_Light_Stemmer(text) return text class_mapping = { 0: "جنائية", 1: "احوال شخصية", 2: "عامة" } st.markdown(""" """, unsafe_allow_html=True) # Function for the Home Page def home_page(): st.markdown('
تطبيق وجيز يقدم لك خدمة التصنيف والملخص للنصوص القانونية. يمكنك إدخال النصوص هنا للحصول على تصنيف دقيق وملخص شامل.
', unsafe_allow_html=True) def main_page(): st.title("صنف ولخص") # Input text area input_text = st.text_area("ادخل النص", "") if st.button('صنف ولخص'): if input_text: prepro = preprocess_text(input_text) features = vectorizer.transform([prepro]) prediction = model_classify.predict(features) classifiy = prediction[0] classifiy_class = class_mapping.get(classifiy, "لم يتم التعرف") summarized_text = summarize_text(input_text) st.markdown('تصنيف القضية :
', unsafe_allow_html=True) st.write(classifiy_class) st.markdown('ملخص للقضية :
', unsafe_allow_html=True) st.write(summarized_text) def app(): # Sidebar navigation with logo inside the sidebar with st.sidebar: st.markdown('