berfinduman committed on
Commit 1590525 · 1 Parent(s): cfa6d57

Upload 2 files

Files changed (2)
  1. evaluation_comp.py +213 -0
  2. finalberturk_ensemble.py +296 -0
evaluation_comp.py ADDED
@@ -0,0 +1,213 @@
+ # -*- coding: utf-8 -*-
+ """evaluation_comp.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1qD1t_GF67fbwftmUYfuMDpwVFICPk5kJ
+ """
+
+ !pip install gradio
+ !pip install transformers
+
+ import gradio as gr
+ import pandas as pd
+ from torch import nn
+ from transformers import BertModel, BertTokenizer
+ from sklearn.metrics import f1_score
+ import torch
+ import nltk
+ nltk.download(['punkt', 'stopwords'])
+ import re
+
+ def remove_short_strings(df: pd.DataFrame, string_column: str) -> pd.DataFrame:
+     df[string_column] = df[string_column].astype(str)
+     df['length'] = df[string_column].str.len()
+     df = df.drop(df[df['length'] == 1].index)
+     df = df.drop(columns=['length'])
+     return df
+
+ def remove_one_character_words(row):
+     words = row['text'].split()
+     return ' '.join([word for word in words if len(word) > 1])
+
+ def ret_list_to_str(liste):
+     return " ".join(i for i in liste)
+
+ def preprocess_tweet(tweet):
+     # Convert to lower case
+     tweet = tweet.lower()
+     # Collapse runs of repeating characters down to two
+     tweet = re.sub(r'(.)\1+', r'\1\1', tweet)
+     # Remove non-Turkish characters
+     tweet = re.sub(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s]', '', tweet)
+     # Remove extra whitespace
+     tweet = re.sub(r'\s+', ' ', tweet).strip()
+     return tweet
+
+ def cleaning_stopwords(text, stop_words):
+     return " ".join([word for word in str(text).split() if word not in stop_words])
+
+ from nltk.corpus import stopwords
+
+ # Turkish stop words
+ turkish_stopwords = stopwords.words('turkish')
+ turkish_stopwords.append("bir")
+ turkish_stopwords = set(turkish_stopwords)
+ ## there are some odd words in this list; better not to rely on it
+
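+ # A quick sketch of what the cleaning helpers above do (illustrative only;
+ # the exact stop-word behaviour depends on NLTK's downloaded 'turkish' list):
+ #   preprocess_tweet("Buuu çok kötüüü!!! 123")              # -> "buu çok kötüü"
+ #   cleaning_stopwords("bu bir deneme", turkish_stopwords)  # -> "deneme"
+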
+ from sklearn import preprocessing
+ from nltk.tokenize import word_tokenize
+
+ def prep_and_sw_and_tokenize(df):
+     turkish_stopwords = stopwords.words('turkish')
+     turkish_stopwords.append("bir")
+     stop_words = set(turkish_stopwords)
+     df["text"] = df["text"].apply(preprocess_tweet)
+     df['text'] = df["text"].apply(lambda text: cleaning_stopwords(text, stop_words))
+     #df['text'] = df.apply(remove_one_character_words, axis=1)
+     return df
+
+ tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
+
+ class BertClassifierConv1D(nn.Module):
+     def __init__(self, dropout=0.5, num_classes=5):
+         super(BertClassifierConv1D, self).__init__()
+         self.bert = BertModel.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', return_dict=True)
+         self.conv1d = nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=128, kernel_size=5)
+         self.bilstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=1, bidirectional=True, batch_first=True)
+         self.dropout = nn.Dropout(dropout)
+         self.linear = nn.Linear(128, num_classes)  # 128 = 2 * 64 (bidirectional LSTM output)
+
+     def forward(self, input_id, mask):
+         output = self.bert(input_ids=input_id, attention_mask=mask).last_hidden_state
+         output = output.permute(0, 2, 1)  # swap dimensions to prepare for the Conv1d layer
+         output = self.conv1d(output)
+         output, _ = self.bilstm(output.transpose(1, 2))  # back to (batch, seq, channels)
+         output = self.dropout(output)
+         output = self.linear(output.mean(dim=1))  # mean-pool over the sequence
+         return output
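+
+ # Tensor-shape sketch for a batch of B inputs padded to max_length=512
+ # (assumes the 768-dim hidden size of this BERT-base checkpoint):
+ #   last_hidden_state (B, 512, 768) -> permute    -> (B, 768, 512)
+ #   Conv1d(k=5)       -> (B, 128, 508) -> transpose -> (B, 508, 128)
+ #   BiLSTM(64, bidir) -> (B, 508, 128) -> mean(dim=1) -> (B, 128) -> Linear -> (B, 5)
+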
+ class Dataset(torch.utils.data.Dataset):
+     def __init__(self, df):
+         self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt") for text in df]
+
+     def __len__(self):
+         return len(self.texts)
+
+     def __getitem__(self, idx):
+         batch_texts = self.texts[idx]
+         return batch_texts
+
+ def evaluate(model, test_data):
+     test = Dataset(test_data)
+     test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)
+
+     # Inference runs on the CPU here; the CUDA path is kept commented out.
+     #use_cuda = torch.cuda.is_available()
+     #device = torch.device("cuda" if use_cuda else "cpu")
+     device = torch.device("cpu")
+     #if use_cuda:
+     #    model = model.cuda()
+
+     model.eval()  # disable dropout for inference
+     output_indices = []
+     with torch.no_grad():
+         for test_input in test_dataloader:
+             mask = test_input['attention_mask'].to(device)
+             input_id = test_input['input_ids'].squeeze(1).to(device)
+
+             output = model(input_id, mask)
+
+             batch_indices = output.argmax(dim=1).tolist()
+             output_indices.extend(batch_indices)
+
+     return output_indices
+
+ def auth(username, password):
+     if username == "Hive_Hereos" and password == "Y2IB3HV8GBXED00S":
+         return True
+     else:
+         return False
+
+ model = BertClassifierConv1D()
+ model.load_state_dict(torch.load(r"sontotalmodel_finallll.pt", map_location=torch.device('cpu')))
+
+ import logging
+ logging.basicConfig(filename=r'app.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)
+
+ def predict(df):
+     # TODO:
+     df["offensive"] = 1
+     df["target"] = None
+     # ***************************
+     try:
+         # WRITE YOUR INFERENCE STEPS BELOW # HERE
+         text = df["text"]  # keep the raw text so it can be restored after preprocessing
+         df = prep_and_sw_and_tokenize(df)
+         #df.to_csv("preprocess.csv", index=False, sep="|")
+         labels = {'INSULT': 0,
+                   'OTHER': 1,
+                   'PROFANITY': 2,
+                   'RACIST': 3,
+                   'SEXIST': 4}
+         logging.info("Starting")
+
+         logging.info("Model loaded")
+         logging.info(df.text)
+         a = evaluate(model, df["text"])
+
+         test_labels = []
+         for number in a:
+             label = list(labels.keys())[list(labels.values()).index(number)]  # map the index back to its label
+             test_labels.append(label)  # collect the predicted label
+         df["target"] = test_labels
+
+         # Everything except OTHER counts as offensive
+         for index, row in df.iterrows():
+             if row['target'] == 'OTHER':
+                 df.at[index, 'offensive'] = 0
+         df["text"] = text
+     except Exception as e:
+         logging.error("Error occurred", exc_info=True)
+         raise e
+     #
+     # *********** END ***********
+
+     return df
+
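+ # A terser, equivalent way to invert the label map inside predict() (sketch):
+ #   id2label = {v: k for k, v in labels.items()}
+ #   df["target"] = [id2label[i] for i in a]
+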
+ def get_file(file):
+     output_file = "output_Hive_Hereos.csv"
+
+     # For Windows users, normalize the path separator
+     file_name = file.name.replace("\\", "/")
+
+     df = pd.read_csv(file_name, sep="|")
+
+     predict(df)  # predict() fills the columns in place on this same frame
+     df.to_csv(output_file, index=False, sep="|")
+     return output_file
+
+ # Launch the interface with username/password authentication
+ iface = gr.Interface(get_file, "file", "file")
+
+ if __name__ == "__main__":
+     iface.launch(share=True, auth=auth, debug=True)
+
+ iface.close()
+
+ import session_info
+ session_info.show()
+
finalberturk_ensemble.py ADDED
@@ -0,0 +1,296 @@
+ # -*- coding: utf-8 -*-
+ """FINALberturk_ensemble.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1yAhhmVl42CAD5BCvUCtjMO7utTU2cGqE
+ """
+
+ !pip install transformers
+
+ # Commented out IPython magic to ensure Python compatibility.
+ import numpy as np  # linear algebra
+ import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+
+ # For EDA
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ # Packages for general use throughout the notebook.
+ import random
+ import warnings
+ import time
+ # %matplotlib inline
+ from sklearn.model_selection import train_test_split
+
+ # Display columns in full
+ pd.set_option('display.max_colwidth', None)
+
+ # For building the model (note: the TensorFlow/Keras imports below are not
+ # actually used by the PyTorch model defined later in this script)
+ import tensorflow as tf
+ from tensorflow.keras.layers import Add, GlobalAvgPool1D, MaxPool1D, Activation, BatchNormalization, Embedding, LSTM, Dense, Bidirectional, Input, SpatialDropout1D, Dropout, Conv1D
+ from tensorflow.keras import Model
+ from transformers import BertTokenizer, TFBertModel
+ from tensorflow.keras.activations import relu
+
+ from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
+
+ # Input data files are available in the read-only "../input/" directory
+ import os
+ for dirname, _, filenames in os.walk('/kaggle/input'):
+     for filename in filenames:
+         print(os.path.join(dirname, filename))
+
+ import torch
+ import numpy as np
+ from transformers import BertTokenizer, BertModel
+ import time
+ from datetime import datetime
+ import matplotlib.pyplot as plt
+ import torch
+ import torch.nn as nn
+ from torch.optim import Adam
+ from tqdm import tqdm
+ from torch.optim.lr_scheduler import ReduceLROnPlateau
+
+ !pip install session_info
+
+ import session_info
+ session_info.show()
+
+ dataset = pd.read_csv(r"train_with_preprocess.csv")
+ dataset
+
+ df = dataset[["first_p_sec_sw", "target"]]
+ df.columns = ["text", "target"]
+ df
+
+ tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
+
+ labels = {'INSULT': 0,
+           'OTHER': 1,
+           'PROFANITY': 2,
+           'RACIST': 3,
+           'SEXIST': 4}
+
+ class Dataset(torch.utils.data.Dataset):
+
+     def __init__(self, df):
+         self.labels = [labels[label] for label in df['target']]
+         self.texts = [tokenizer(text,
+                                 padding='max_length', max_length=512, truncation=True,
+                                 return_tensors="pt") for text in df['text']]
+
+     def classes(self):
+         return self.labels
+
+     def __len__(self):
+         return len(self.labels)
+
+     def get_batch_labels(self, idx):
+         # Fetch a batch of labels
+         return np.array(self.labels[idx])
+
+     def get_batch_texts(self, idx):
+         # Fetch a batch of inputs
+         return self.texts[idx]
+
+     def __getitem__(self, idx):
+         batch_texts = self.get_batch_texts(idx)
+         batch_y = self.get_batch_labels(idx)
+         return batch_texts, batch_y
+
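+ # Each item is a (tokenized_text, label) pair; a quick shape sketch:
+ #   texts, y = Dataset(df_train)[0]
+ #   texts['input_ids'].shape   # torch.Size([1, 512]) -- squeezed later in train()
+ #   y                          # 0-d array holding the class index from `labels`
+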
+ np.random.seed(112)
+ df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
+                                      [int(.8 * len(df)), int(.9 * len(df))])
+
+ print(len(df_train), len(df_val), len(df_test))
+
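+ # np.split with cut points at 80% and 90% of the shuffled frame yields an
+ # 80/10/10 train/val/test split, e.g. 12,000 rows -> 9,600 / 1,200 / 1,200.
+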
+ class BertClassifierConv1D(nn.Module):
+     def __init__(self, dropout=0.5, num_classes=5):
+         super(BertClassifierConv1D, self).__init__()
+         self.bert = BertModel.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', return_dict=True)
+         self.conv1d = nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=128, kernel_size=5)
+         self.bilstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=1, bidirectional=True, batch_first=True)
+         self.dropout = nn.Dropout(dropout)
+         self.linear = nn.Linear(128, num_classes)  # 128 = 2 * 64 (bidirectional LSTM output)
+
+     def forward(self, input_id, mask):
+         output = self.bert(input_ids=input_id, attention_mask=mask).last_hidden_state
+         output = output.permute(0, 2, 1)  # swap dimensions to prepare for the Conv1d layer
+         output = self.conv1d(output)
+         output, _ = self.bilstm(output.transpose(1, 2))  # back to (batch, seq, channels)
+         output = self.dropout(output)
+         output = self.linear(output.mean(dim=1))  # mean-pool over the sequence
+         return output
+
+ def plot_graphs(history, string):
+     plt.plot(history[string])
+     plt.plot(history['val_' + string])
+     plt.xlabel("Epochs")
+     plt.ylabel(string)
+     plt.legend([string, 'val_' + string])
+     plt.show()
+
+ def train(model, train_data, val_data, learning_rate, epochs, patience=3):
+     train, val = Dataset(train_data), Dataset(val_data)
+
+     train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)
+     val_dataloader = torch.utils.data.DataLoader(val, batch_size=32)
+
+     use_cuda = torch.cuda.is_available()
+     device = torch.device("cuda" if use_cuda else "cpu")
+
+     criterion = nn.CrossEntropyLoss()
+     optimizer = Adam(model.parameters(), lr=learning_rate)
+
+     if use_cuda:
+         model = model.cuda()
+         criterion = criterion.cuda()
+
+     history = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': []}
+     best_val_loss = float('inf')
+     counter = 0
+     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True, cooldown=0)
+
+     for epoch_num in range(epochs):
+         model.train()  # enable dropout for the training pass
+         total_acc_train = 0
+         total_loss_train = 0
+
+         for train_input, train_label in tqdm(train_dataloader):
+             train_label = train_label.to(device)
+             mask = train_input['attention_mask'].to(device)
+             input_id = train_input['input_ids'].squeeze(1).to(device)
+
+             output = model(input_id, mask)
+
+             batch_loss = criterion(output, train_label.long())
+             total_loss_train += batch_loss.item()
+
+             acc = (output.argmax(dim=1) == train_label).sum().item()
+             total_acc_train += acc
+
+             model.zero_grad()
+             batch_loss.backward()
+             optimizer.step()
+
+         total_acc_val = 0
+         total_loss_val = 0
+
+         model.eval()  # disable dropout for validation
+         with torch.no_grad():
+             for val_input, val_label in val_dataloader:
+                 val_label = val_label.to(device)
+                 mask = val_input['attention_mask'].to(device)
+                 input_id = val_input['input_ids'].squeeze(1).to(device)
+
+                 output = model(input_id, mask)
+
+                 batch_loss = criterion(output, val_label.long())
+                 total_loss_val += batch_loss.item()
+
+                 acc = (output.argmax(dim=1) == val_label).sum().item()
+                 total_acc_val += acc
+
+         train_loss = total_loss_train / len(train_data)
+         train_acc = total_acc_train / len(train_data)
+         val_loss = total_loss_val / len(val_data)
+         val_acc = total_acc_val / len(val_data)
+         history['loss'].append(train_loss)
+         history['accuracy'].append(train_acc)
+         history['val_loss'].append(val_loss)
+         history['val_accuracy'].append(val_acc)
+         print(f'Epochs: {epoch_num + 1} | Train Loss: {train_loss:.3f} | Train Accuracy: {train_acc:.3f} | Val Loss: {val_loss:.3f} | Val Accuracy: {val_acc:.3f}')
+
+         # Early stopping: halt after `patience` epochs without val-loss improvement
+         if val_loss < best_val_loss:
+             best_val_loss = val_loss
+             counter = 0
+         else:
+             counter += 1
+             if counter >= patience:
+                 print(f'Early stopping at epoch {epoch_num+1}')
+                 break
+         scheduler.step(val_loss)
+
+     plot_graphs(history, "accuracy")
+     plot_graphs(history, "loss")
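+
+ # Note (sketch): the early-stopping branch above halts training but does not
+ # restore the best weights; a common refinement is to checkpoint whenever
+ # val_loss improves, e.g. torch.save(model.state_dict(), "best.pt"), and
+ # reload that file after the loop.
+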
+ EPOCHS = 15
+ model = BertClassifierConv1D()
+ LR = 1e-6
+
+ train(model, df_train, df_val, LR, EPOCHS)
+
+ # `datetime` is part of the Python standard library, so no pip install is needed.
+
+ now = datetime.now()
+ seed = int(now.strftime("%Y%m%d%H%M%S"))  # timestamp-based seed
+ print(seed)
+ random.seed(seed)
+ random_time = random.randint(0, 350)
+ model_path = 'model_weights' + str(random_time) + ".pth"
+ torch.save(model.state_dict(), model_path)
+ print(model_path)
+
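+ # To reload this checkpoint later (sketch; model_path is printed above):
+ #   model = BertClassifierConv1D()
+ #   model.load_state_dict(torch.load(model_path, map_location="cpu"))
+ #   model.eval()
+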
+ def evaluate(model, test_data):
+     test = Dataset(test_data)
+     test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)
+
+     use_cuda = torch.cuda.is_available()
+     device = torch.device("cuda" if use_cuda else "cpu")
+
+     if use_cuda:
+         model = model.cuda()
+
+     model.eval()  # disable dropout for evaluation
+     total_acc_test = 0
+     output_indices = []
+     test_labels = []
+     with torch.no_grad():
+         for test_input, test_label in test_dataloader:
+             test_label = test_label.to(device)
+             mask = test_input['attention_mask'].to(device)
+             input_id = test_input['input_ids'].squeeze(1).to(device)
+
+             output = model(input_id, mask)
+
+             acc = (output.argmax(dim=1) == test_label).sum().item()
+             total_acc_test += acc
+
+             batch_indices = output.argmax(dim=1).tolist()
+             output_indices.extend(batch_indices)
+             test_labels.extend(test_label.tolist())  # store plain ints, not tensors
+
+     print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
+     return output_indices, test_labels
+
+ y_pred, y_test = evaluate(model, df_test)
+
+ y_pred_tensor = torch.tensor(y_pred)
+ y_test_tensor = torch.tensor(y_test)
+
+ # scikit-learn metrics expect (y_true, y_pred) in that order
+ print(classification_report(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu()), output_dict=True))
+
+ from sklearn.metrics import f1_score
+ f1_score(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu()), average='macro')
+
+ def conf_matrix(y_test, y_pred):
+     cm = confusion_matrix(y_test, y_pred, normalize="true")
+     class_names = ["INSULT", "OTHER", "PROFANITY", "RACIST", "SEXIST"]
+     sns.heatmap(cm, annot=True, cmap="Blues", xticklabels=class_names, yticklabels=class_names)
+     plt.xlabel('Predicted Class')
+     plt.ylabel('True Class')
+     plt.show()
+
+ conf_matrix(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu()))