|
|
|
"""evaluation_comp.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1qD1t_GF67fbwftmUYfuMDpwVFICPk5kJ |
|
""" |
|
|
|
!pip install gradio |
|
|
|
!pip install transformers |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
from torch import nn |
|
from transformers import BertModel |
|
from transformers import BertTokenizer |
|
|
import torch |
|
import nltk |
|
nltk.download('stopwords')  # only the stop-word corpus is used below
|
import re |
|
|
|
def remove_short_strings(df: pd.DataFrame, string_column: str) -> pd.DataFrame:
    # Drop rows whose text is only a single character long.
    df[string_column] = df[string_column].astype(str)
    df['length'] = df[string_column].str.len()
    df = df.drop(df[df['length'] == 1].index)
    df = df.drop(columns=['length'])
    return df

def remove_one_character_words(row):
    # Drop one-character words from a row's text.
    words = row['text'].split()
    return ' '.join([word for word in words if len(word) > 1])
|
def ret_list_to_str(liste):
    # Join a list of strings into a single space-separated string.
    return " ".join(liste)
|
def preprocess_tweet(tweet):
    # Lowercase the tweet.
    tweet = tweet.lower()
    # Collapse characters repeated more than twice down to two (e.g. "çooook" -> "çook").
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet)
    # Keep only Latin and Turkish letters plus whitespace.
    tweet = re.sub(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s]', '', tweet)
    # Normalize runs of whitespace to a single space.
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet
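# A quick trace of preprocess_tweet on a made-up tweet (hypothetical example, not
# from the original notebook):
#   preprocess_tweet("Selam!!! Çooook güzeeel :)")
#   lowercase            -> "selam!!! çooook güzeeel :)"
#   collapse repeats     -> "selam!! çook güzeel :)"
#   strip non-letters    -> "selam çook güzeel "
#   normalize whitespace -> "selam çook güzeel"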
|
def cleaning_stopwords(text, stop_words):
    # Remove stop words from the text.
    return " ".join([word for word in str(text).split() if word not in stop_words])
|
from nltk.corpus import stopwords

# NLTK's Turkish stop-word list does not include "bir" ("a/one"), so add it manually.
turkish_stopwords = stopwords.words('turkish')
turkish_stopwords.append("bir")
turkish_stopwords = set(turkish_stopwords)
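# Example (hypothetical sentence): "bu" and "ve" come from NLTK's list, "bir" from
# the manual append above.
#   cleaning_stopwords("bu bir deneme ve test", turkish_stopwords)
#   -> "deneme test"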
|
|
|
|
|
|
|
|
|
|
|
|
def prep_and_sw_and_tokenize(df):
    # Clean each tweet, then strip Turkish stop words (set built above).
    df["text"] = df["text"].apply(preprocess_tweet)
    df['text'] = df["text"].apply(lambda text: cleaning_stopwords(text, turkish_stopwords))
    return df
|
|
|
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased") |
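# Sketch of the tokenizer output consumed below (example text is made up; shapes
# assume the max_length=512 used in Dataset):
#   enc = tokenizer("örnek tweet", padding='max_length', max_length=512,
#                   truncation=True, return_tensors="pt")
#   enc['input_ids'].shape      -> torch.Size([1, 512])
#   enc['attention_mask'].shape -> torch.Size([1, 512])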
|
class BertClassifierConv1D(nn.Module):
    """BERT encoder followed by a Conv1d, a bidirectional LSTM, and a linear head."""

    def __init__(self, dropout=0.5, num_classes=5):
        super(BertClassifierConv1D, self).__init__()
        self.bert = BertModel.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', return_dict=True)
        # Convolve over the token dimension of BERT's hidden states.
        self.conv1d = nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=128, kernel_size=5)
        # Bidirectional LSTM: 64 hidden units per direction -> 128-dim outputs.
        self.bilstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(128, num_classes)
|
|
|
    def forward(self, input_id, mask):
        output = self.bert(input_ids=input_id, attention_mask=mask).last_hidden_state
        # (batch, seq, hidden) -> (batch, hidden, seq) for Conv1d.
        output = output.permute(0, 2, 1)
        output = self.conv1d(output)
        # Back to (batch, seq, channels) for the batch-first LSTM.
        output, _ = self.bilstm(output.transpose(1, 2))
        output = self.dropout(output)
        # Mean-pool over the sequence, then classify.
        output = self.linear(output.mean(dim=1))
        return output
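# Tensor shapes through forward(), for batch size B and 512-token inputs
# (hidden_size is 768 for this BERT; kernel_size=5 with no padding shrinks the
# sequence to 508):
#   bert last_hidden_state : (B, 512, 768)
#   permute                : (B, 768, 512)
#   conv1d                 : (B, 128, 508)
#   transpose -> bilstm    : (B, 508, 128)   # 64 units x 2 directions
#   mean(dim=1) -> linear  : (B, 5)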
|
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        # Tokenize every text up front, padded/truncated to 512 tokens.
        self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt") for text in df]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]
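# Note: each item carries tensors of shape (1, 512), so the DataLoader in
# evaluate() yields 'input_ids' of shape (batch, 1, 512) -- hence the
# .squeeze(1) there.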
|
def evaluate(model, test_data):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)

    device = torch.device("cpu")
    model = model.to(device)
    model.eval()  # disable dropout for inference

    output_indices = []
    with torch.no_grad():
        for test_input in test_dataloader:
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            # Predicted class index per example.
            batch_indices = output.argmax(dim=1).tolist()
            output_indices.extend(batch_indices)

    return output_indices
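# Example call (hypothetical inputs): evaluate() returns one class index per text,
# which predict() maps back to label names.
#   preds = evaluate(model, pd.Series(["ilk örnek", "ikinci örnek"]))
#   preds -> e.g. [1, 0]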
|
|
|
def auth(username, password):
    # Gradio login check with hard-coded credentials.
    return username == "Hive_Hereos" and password == "Y2IB3HV8GBXED00S"
|
|
|
model = BertClassifierConv1D()

# Load the trained weights onto the CPU.
model.load_state_dict(torch.load("sontotalmodel_finallll.pt", map_location=torch.device('cpu')))
|
|
|
import logging |
|
logging.basicConfig(filename=r'app.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO) |
|
|
|
|
|
def predict(df):
    # Default every row to offensive; corrected after classification below.
    df["offensive"] = 1
    df["target"] = None

    try:
        # Keep the raw text so the cleaned version is not written to the output.
        text = df["text"]
        df = prep_and_sw_and_tokenize(df)

        labels = {'INSULT': 0,
                  'OTHER': 1,
                  'PROFANITY': 2,
                  'RACIST': 3,
                  'SEXIST': 4
                  }
        logging.info("Starting")

        logging.info("Model loaded")
        logging.info(df.text)
        a = evaluate(model, df["text"])

        # Map predicted class indices back to label names.
        id_to_label = {v: k for k, v in labels.items()}
        df["target"] = [id_to_label[number] for number in a]

        # Only 'OTHER' counts as non-offensive.
        for index, row in df.iterrows():
            if row['target'] == 'OTHER':
                df.at[index, 'offensive'] = 0
        df["text"] = text
    except Exception as e:
        logging.error("Error occurred", exc_info=True)
        raise e

    return df
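# Example round trip (hypothetical frame): predict() mutates df in place and also
# returns it; rows classified as 'OTHER' get offensive == 0.
#   df = pd.DataFrame({"id": [1], "text": ["örnek bir tweet"]})
#   predict(df)
#   df[["text", "target", "offensive"]]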
|
|
|
def get_file(file):
    output_file = "output_Hive_Hereos.csv"

    # Gradio provides a temp file; normalize Windows-style path separators.
    file_name = file.name.replace("\\", "/")

    df = pd.read_csv(file_name, sep="|")

    predict(df)
    df.to_csv(output_file, index=False, sep="|")
    return output_file
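# The uploaded file is expected to be pipe-separated with at least a "text"
# column, e.g. (hypothetical):
#   id|text
#   1|örnek bir tweet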
|
|
|
|
|
iface = gr.Interface(get_file, "file", "file") |
|
|
|
if __name__ == "__main__":
    iface.launch(share=True, auth=auth, debug=True)
|
|
|
iface.close() |
|
|
|
import session_info |
|
session_info.show() |
|
|
|
|