"""FINALberturk_ensemble.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1yAhhmVl42CAD5BCvUCtjMO7utTU2cGqE
"""

!pip install transformers

import os
import random
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

from transformers import BertTokenizer, BertModel

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

pd.set_option('display.max_colwidth', None)

# List any input files available in the Kaggle environment.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

!pip install session_info

import session_info
session_info.show()

dataset = pd.read_csv(r"train_with_preprocess.csv")
dataset

# Keep only the preprocessed text column and the label column.
df = dataset[["first_p_sec_sw", "target"]]
df.columns = ["text", "target"]
df
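
# Quick look at the class balance before modelling (illustrative sanity check).
print(df['target'].value_counts())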

# BERTurk tokenizer (uncased, 128k-vocabulary variant).
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")

# Map the five target classes to integer ids.
labels = {'INSULT': 0,
          'OTHER': 1,
          'PROFANITY': 2,
          'RACIST': 3,
          'SEXIST': 4}
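
# Illustrative sanity check: tokenizing one made-up Turkish sentence shows the
# tensor shapes each example will carry into the model.
sample = tokenizer("örnek bir cümle", padding='max_length', max_length=512,
                   truncation=True, return_tensors="pt")
print(sample['input_ids'].shape, sample['attention_mask'].shape)  # both torch.Size([1, 512])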

class Dataset(torch.utils.data.Dataset):
    """Tokenizes all texts up front and serves (encoding, label) pairs."""

    def __init__(self, df):
        self.labels = [labels[label] for label in df['target']]
        self.texts = [tokenizer(text,
                                padding='max_length', max_length=512,
                                truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

# Shuffle once, then split 80/10/10 into train/validation/test sets.
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                                     [int(.8 * len(df)), int(.9 * len(df))])

print(len(df_train), len(df_val), len(df_test))
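
# Illustrative check: a single shuffle should keep the label distribution
# roughly stable across the three partitions.
for name, part in [("train", df_train), ("val", df_val), ("test", df_test)]:
    print(name, part['target'].value_counts(normalize=True).round(3).to_dict())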

class BertClassifierConv1D(nn.Module):
    """BERTurk encoder followed by a Conv1d, a BiLSTM, and a linear head."""

    def __init__(self, dropout=0.5, num_classes=5):
        super(BertClassifierConv1D, self).__init__()

        self.bert = BertModel.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', return_dict=True)
        self.conv1d = nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=128, kernel_size=5)
        self.bilstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(128, num_classes)

    def forward(self, input_id, mask):
        # (batch, seq_len, hidden) from BERT; Conv1d expects channels first.
        output = self.bert(input_ids=input_id, attention_mask=mask).last_hidden_state
        output = output.permute(0, 2, 1)
        output = self.conv1d(output)
        # Back to (batch, seq_len, channels) for the BiLSTM.
        output, _ = self.bilstm(output.transpose(1, 2))
        output = self.dropout(output)
        # Mean-pool over time steps, then project to class logits.
        output = self.linear(output.mean(dim=1))
        return output
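
# Illustrative shape walk-through (assumes random token ids are acceptable for
# a dry run): two 512-token inputs should produce logits of shape (2, 5).
_demo = BertClassifierConv1D()
_ids = torch.randint(0, _demo.bert.config.vocab_size, (2, 512))
_mask = torch.ones(2, 512, dtype=torch.long)
with torch.no_grad():
    print(_demo(_ids, _mask).shape)  # torch.Size([2, 5])
del _demo, _ids, _mask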

def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch."""
    plt.plot(history[string])
    plt.plot(history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()

def train(model, train_data, val_data, learning_rate, epochs, patience=3):

    train_ds, val_ds = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_ds, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    history = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': []}
    best_val_loss = float('inf')
    counter = 0
    # Reduce the learning rate 10x when the validation loss plateaus.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True, cooldown=0)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            # Zero stale gradients, backpropagate, and take an optimizer step.
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        # Per-sample averages for the epoch.
        train_loss = total_loss_train / len(train_data)
        train_acc = total_acc_train / len(train_data)
        val_loss = total_loss_val / len(val_data)
        val_acc = total_acc_val / len(val_data)
        history['loss'].append(train_loss)
        history['accuracy'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_acc)
        print(f'Epochs: {epoch_num + 1} | Train Loss: {train_loss:.3f} | Train Accuracy: {train_acc:.3f} | Val Loss: {val_loss:.3f} | Val Accuracy: {val_acc:.3f}')

        # Early stopping: stop once the validation loss has not improved for
        # `patience` consecutive epochs.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f'Early stopping at epoch {epoch_num + 1}')
                break
        scheduler.step(val_loss)

    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")

EPOCHS = 15
LR = 1e-6
model = BertClassifierConv1D()

train(model, df_train, df_val, LR, EPOCHS)

# Derive a quasi-unique suffix for the checkpoint filename from the current time.
now = datetime.now()
seed = int(now.strftime("%Y%m%d%H%M%S"))
print(seed)
random.seed(seed)
random_time = random.randint(0, 350)
model_path = 'model_weights' + str(random_time) + ".pth"
torch.save(model.state_dict(), model_path)
print(model_path)
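
# Illustrative round-trip check: reload the checkpoint into a fresh instance.
# map_location='cpu' keeps this working even without a GPU.
reloaded = BertClassifierConv1D()
reloaded.load_state_dict(torch.load(model_path, map_location='cpu'))
reloaded.eval()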

def evaluate(model, test_data):

    test_ds = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    output_indices = []
    test_labels = []
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

            # Collect predictions and labels as plain Python ints so the
            # sklearn metrics below can consume them directly.
            output_indices.extend(output.argmax(dim=1).tolist())
            test_labels.extend(test_label.cpu().tolist())

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    return output_indices, test_labels

y_pred, y_test = evaluate(model, df_test)
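
# Cross-check (illustrative): sklearn's accuracy should match the value
# printed inside evaluate().
print(accuracy_score(y_test, y_pred))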

y_pred_tensor = torch.tensor(y_pred)
y_test_tensor = torch.tensor(y_test)

# sklearn expects ground truth first, predictions second.
print(classification_report(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu())))

f1_score(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu()), average='macro')
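
# Per-class F1 (illustrative complement to the macro average), in label-id
# order INSULT, OTHER, PROFANITY, RACIST, SEXIST:
print(f1_score(y_test, y_pred, average=None))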

def conf_matrix(y_test, y_pred):
    class_names = ["INSULT", "OTHER", "PROFANITY", "RACIST", "SEXIST"]
    # Row-normalised matrix: each row shows how a true class is distributed
    # over the predicted classes.
    cm = confusion_matrix(y_test, y_pred, normalize="true")
    sns.heatmap(cm, annot=True, cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Class')
    plt.ylabel('True Class')
    plt.show()

conf_matrix(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu()))