"""evaluation_comp.ipynb |

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1qD1t_GF67fbwftmUYfuMDpwVFICPk5kJ
""" |
# Notebook-only dependency installation. The IPython "!" shell escape is not
# valid Python syntax, so the commands are kept here as comments. Install the
# packages before running this file as a script:
#   pip install gradio
#   pip install transformers
import gradio as gr |
import pandas as pd |
from torch import nn |
from transformers import BertModel |
from transformers import BertTokenizer |
from sklearn.metrics import f1_score |
import torch |
import nltk |
# NLTK data packages fetched at import time; 'stopwords' backs the Turkish
# stopword list used further down in this file.
_NLTK_PACKAGES = ['punkt', 'stopwords']
nltk.download(_NLTK_PACKAGES)
import re |
def remove_short_strings(df: pd.DataFrame, string_column: str) -> pd.DataFrame:
    """Return a copy of *df* without rows whose string is one character long.

    The values of ``string_column`` are converted to ``str`` in the returned
    frame. Only rows whose converted value has length exactly 1 are removed;
    empty strings and longer strings are kept.

    Args:
        df: Input frame. It is left untouched (no cast, no helper column).
        string_column: Name of the column whose string lengths are checked.

    Returns:
        A new DataFrame with the one-character rows removed and
        ``string_column`` cast to ``str``.
    """
    result = df.copy()
    result[string_column] = result[string_column].astype(str)
    # Filter with a boolean mask instead of dropping by index label: a
    # label-based drop removes every row sharing the label, which deletes
    # valid rows when the index contains duplicates. The mask also avoids a
    # temporary 'length' column that could clobber a real column of that name.
    is_single_char = result[string_column].str.len() == 1
    return result[~is_single_char]
def remove_one_character_words(row, string_column='text'):
    """Return the row's text with every one-character word removed.

    The text is split on whitespace, so the result is re-joined with single
    spaces regardless of the original spacing.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.
        string_column: Key of the text field to clean. Defaults to ``'text'``,
            which is the column the original implementation was fixed to.

    Returns:
        The remaining words joined by single spaces.
    """
    return ' '.join(word for word in row[string_column].split() if len(word) > 1)
def ret_list_to_str(liste):
    """Join an iterable of strings into one space-separated string.

    Args:
        liste: Iterable of ``str`` items.

    Returns:
        The items joined by single spaces; an empty string for empty input.
    """
    # str.join consumes any iterable directly; no generator wrapper is needed.
    return " ".join(liste)
# Patterns are compiled once at import time because preprocess_tweet is
# applied to every row of the input.
_REPEATED_CHAR_RE = re.compile(r'(.)\1+')
_NON_LETTER_RE = re.compile(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_tweet(tweet):
    """Normalize a tweet to lowercase letters separated by single spaces.

    Steps, in order:
      1. lowercase the text with ``str.lower``;
      2. shorten any run of the same character to exactly two occurrences;
      3. delete every character that is not an ASCII or Turkish letter or
         whitespace (digits, punctuation, emoji, ...);
      4. collapse whitespace runs to one space and strip both ends.

    Args:
        tweet: The raw text. ``None`` and float NaN (what an empty CSV cell
            becomes) are treated as empty text; any other non-string value is
            converted with ``str``.

    Returns:
        The cleaned text, possibly empty.
    """
    if not isinstance(tweet, str):
        # NaN is the only float that is not equal to itself.
        if tweet is None or (isinstance(tweet, float) and tweet != tweet):
            return ""
        tweet = str(tweet)
    tweet = tweet.lower()
    tweet = _REPEATED_CHAR_RE.sub(r'\1\1', tweet)
    tweet = _NON_LETTER_RE.sub('', tweet)
    return _WHITESPACE_RE.sub(' ', tweet).strip()
def cleaning_stopwords(text, stop_words):
    """Drop every whitespace-separated token of *text* found in *stop_words*.

    Args:
        text: Value to filter; it is converted with ``str`` before splitting.
        stop_words: Container supporting ``in`` with the tokens to discard.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in str(text).split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)
from nltk.corpus import stopwords |
# Module-level Turkish stopword set: NLTK's Turkish list extended with "bir".
turkish_stopwords = set(stopwords.words('turkish'))
turkish_stopwords.add("bir")
from sklearn import preprocessing |
from nltk.tokenize import word_tokenize |
def prep_and_sw_and_tokenize(df):
    """Clean the ``text`` column of *df* in place and return the same frame.

    Each value is normalized with ``preprocess_tweet`` and then stripped of
    Turkish stopwords with ``cleaning_stopwords``. Despite the function name,
    no tokenization happens here.

    Args:
        df: DataFrame with a ``text`` column. The column is overwritten on
            this very object, so the caller sees the change as well.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Reuse the module-level stopword set (NLTK Turkish list plus "bir")
    # instead of rebuilding an identical set on every call.
    stop_words = turkish_stopwords
    df["text"] = df["text"].apply(
        lambda text: cleaning_stopwords(preprocess_tweet(text), stop_words)
    )
    return df
# Shared tokenizer, loaded once at import time and used by Dataset below.
_TOKENIZER_CHECKPOINT = "dbmdz/bert-base-turkish-128k-uncased"
tokenizer = BertTokenizer.from_pretrained(_TOKENIZER_CHECKPOINT)
class BertClassifierConv1D(nn.Module):
    """Text classifier: BERT encoder -> Conv1d -> BiLSTM -> dropout -> linear.

    The attribute names (``bert``, ``conv1d``, ``bilstm``, ``dropout``,
    ``linear``) form the keys of the module's state dict, so they must stay
    as they are for saved weights to load.

    Args:
        dropout: Dropout probability applied to the BiLSTM output.
        num_classes: Number of output logits.
    """

    def __init__(self, dropout=0.5, num_classes=5):
        super().__init__()
        self.bert = BertModel.from_pretrained(
            'dbmdz/bert-base-turkish-128k-uncased',
            return_dict=True,
        )
        self.conv1d = nn.Conv1d(
            in_channels=self.bert.config.hidden_size,
            out_channels=128,
            kernel_size=5,
        )
        self.bilstm = nn.LSTM(
            input_size=128,
            hidden_size=64,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        # 128 input features = 2 directions x 64 LSTM hidden units.
        self.linear = nn.Linear(128, num_classes)

    def forward(self, input_id, mask):
        """Return one row of ``num_classes`` logits per input sequence.

        Args:
            input_id: Token ids, forwarded to BERT as ``input_ids``.
            mask: Attention mask, forwarded to BERT as ``attention_mask``.
        """
        encoded = self.bert(input_ids=input_id, attention_mask=mask).last_hidden_state
        # Conv1d takes channels on axis 1, so swap the last two axes going in
        # and swap them back for the batch_first LSTM.
        conv_features = self.conv1d(encoded.permute(0, 2, 1))
        lstm_out, _ = self.bilstm(conv_features.transpose(1, 2))
        # Average over axis 1 (the sequence axis of the batch_first LSTM
        # output) after dropout, then classify.
        pooled = self.dropout(lstm_out).mean(dim=1)
        return self.linear(pooled)
class Dataset(torch.utils.data.Dataset):
    """Dataset of tokenizer encodings, one per text, built eagerly.

    Args:
        df: Iterable of texts. Note that iterating a pandas Series yields its
            values, while iterating a DataFrame would yield column names.
    """

    def __init__(self, df):
        encodings = []
        for text in df:
            # Every text is padded or truncated to exactly 512 tokens.
            encoding = tokenizer(
                text,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encodings.append(encoding)
        self.texts = encodings

    def __len__(self):
        """Number of encoded texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer output stored at position *idx*."""
        return self.texts[idx]
def evaluate(model, test_data):
    """Run *model* over *test_data* and return the predicted class indices.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids, attention_mask)``
            and returning one row of class scores per sample. It is switched
            to evaluation mode and left in that mode.
        test_data: Iterable of texts, wrapped in ``Dataset`` for tokenization.

    Returns:
        A list with one ``int`` class index (argmax of the scores) per input
        text, in input order. Empty input yields an empty list.
    """
    test_dataloader = torch.utils.data.DataLoader(Dataset(test_data), batch_size=32)
    device = torch.device("cpu")
    output_indices = []

    # torch.no_grad() only disables gradient tracking; dropout layers stay
    # active unless the module is in eval mode, which would make predictions
    # random from run to run.
    model.eval()
    with torch.no_grad():
        for test_input in test_dataloader:
            # The tokenizer returns a leading axis of size 1 per sample, which
            # the DataLoader keeps as axis 1 of the batch. Drop it from both
            # tensors so ids and mask have matching 2-D shapes.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            output_indices.extend(output.argmax(dim=1).tolist())
    return output_indices
def auth(username, password):
    """Check login credentials for the Gradio interface.

    Args:
        username: Submitted user name.
        password: Submitted password.

    Returns:
        ``True`` when both values match the expected credentials, otherwise
        ``False`` (including for non-string input).
    """
    # Function-scope import keeps this block self-contained.
    import hmac

    # SECURITY NOTE(review): the credentials are hard-coded in source control.
    # They should be moved to configuration or a secret store and rotated.
    expected_username = "Hive_Hereos"
    expected_password = "Y2IB3HV8GBXED00S"

    if not (isinstance(username, str) and isinstance(password, str)):
        return False

    # compare_digest runs in time independent of where the inputs differ.
    # It rejects non-ASCII str, hence the comparison on encoded bytes.
    # Both checks are evaluated before combining so a wrong user name does
    # not short-circuit the password comparison.
    user_ok = hmac.compare_digest(
        username.encode("utf-8", "surrogatepass"),
        expected_username.encode("utf-8"),
    )
    pass_ok = hmac.compare_digest(
        password.encode("utf-8", "surrogatepass"),
        expected_password.encode("utf-8"),
    )
    return user_ok and pass_ok
# Build the classifier once at import time and load the trained weights onto
# the CPU. (A module-level `global` statement has no effect and was dropped.)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
model = BertClassifierConv1D()
model.load_state_dict(torch.load(r"sontotalmodel_finallll.pt", map_location=torch.device('cpu')))
# The model is only used for inference here: eval mode turns dropout off so
# that predictions are deterministic.
model.eval()
import logging
# filemode='w' truncates app.log every time the module is loaded.
logging.basicConfig(filename=r'app.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)
# Classifier output index -> label name.
# NOTE(review): the original mapping had no entry for index 2 although the
# classifier is built with five output classes by default, so any class-2
# prediction raised ValueError and aborted the whole batch. 'PROFANITY' is
# assumed from the alphabetical ordering of the other four labels; confirm
# against the label encoding used for training.
_INDEX_TO_LABEL = {
    0: 'INSULT',
    1: 'OTHER',
    2: 'PROFANITY',
    3: 'RACIST',
    4: 'SEXIST',
}


def predict(df):
    """Classify every row of *df* and annotate the frame in place.

    Two columns are added, in this order: ``offensive`` (0 when the predicted
    label is ``'OTHER'``, otherwise 1) and ``target`` (the predicted label).
    The ``text`` column is cleaned for the model and then restored to its
    original values before returning.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place, and
            ``get_file`` depends on that.

    Returns:
        The same DataFrame object, annotated.

    Raises:
        Exception: Any failure during preprocessing or inference is logged
            with its traceback and re-raised unchanged.
    """
    df["offensive"] = 1
    df["target"] = None
    try:
        # Keep the untouched column so the output carries the original text.
        original_text = df["text"]
        df = prep_and_sw_and_tokenize(df)
        logging.info("Başlıyoruz")
        logging.info("Model yüklendi")
        logging.info(df.text)
        predicted_indices = evaluate(model, df["text"])
        targets = [_INDEX_TO_LABEL[index] for index in predicted_indices]
        df["target"] = targets
        # Positional assignment: independent of the frame's index labels.
        df["offensive"] = [0 if label == 'OTHER' else 1 for label in targets]
        df["text"] = original_text
    except Exception:
        logging.error("Error occurred", exc_info=True)
        raise
    return df
def get_file(file):
    """Gradio handler: classify an uploaded CSV and return the result file path.

    The upload is read as a ``|``-separated CSV, annotated by ``predict`` and
    written, again ``|``-separated and without the index, to a fixed file
    name in the working directory.

    Args:
        file: Uploaded file object exposing its path as ``file.name``.

    Returns:
        Path of the written CSV, ``"output_Hive_Hereos.csv"``.
    """
    output_file = "output_Hive_Hereos.csv"
    # Normalize Windows path separators before handing the path to pandas.
    file_name = file.name.replace("\\", "/")
    df = pd.read_csv(file_name, sep="|")
    # Write the frame predict() returns, so this handler does not depend on
    # predict() mutating its argument in place.
    annotated = predict(df)
    annotated.to_csv(output_file, index=False, sep="|")
    return output_file
# Gradio UI: one uploaded file in, one result file out.
iface = gr.Interface(fn=get_file, inputs="file", outputs="file")

# NOTE(review): the source's indentation was lost; only the launch call is
# assumed to sit under the __main__ guard. Confirm against the notebook.
if __name__ == "__main__":
    iface.launch(auth=auth, share=True, debug=True)

iface.close()

import session_info

session_info.show()