berfinduman committed on
Commit 1590525 · 1 Parent(s): cfa6d57

Upload 2 files

Files changed (2)
  1. evaluation_comp.py +213 -0
  2. finalberturk_ensemble.py +296 -0
evaluation_comp.py ADDED
@@ -0,0 +1,213 @@
+ # -*- coding: utf-8 -*-
+ """evaluation_comp.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1qD1t_GF67fbwftmUYfuMDpwVFICPk5kJ
+ """
+
+ !pip install gradio
+ !pip install transformers
+
+ import gradio as gr
+ import pandas as pd
+ from torch import nn
+ from transformers import BertModel, BertTokenizer
+ from sklearn.metrics import f1_score
+ import torch
+ import nltk
+ nltk.download(['punkt', 'stopwords'])
+ import re
+
+ def remove_short_strings(df: pd.DataFrame, string_column: str) -> pd.DataFrame:
+     df[string_column] = df[string_column].astype(str)
+     df['length'] = df[string_column].str.len()
+     df = df.drop(df[df['length'] == 1].index)
+     df = df.drop(columns=['length'])
+     return df
+
+ def remove_one_character_words(row):
+     words = row['text'].split()
+     return ' '.join([word for word in words if len(word) > 1])
+
+ def ret_list_to_str(liste):
+     return " ".join(i for i in liste)
+
+ def preprocess_tweet(tweet):
+     # Convert to lower case
+     tweet = tweet.lower()
+     # Collapse runs of repeating characters down to two
+     tweet = re.sub(r'(.)\1+', r'\1\1', tweet)
+     # Remove non-Turkish characters
+     tweet = re.sub(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s]', '', tweet)
+     # Remove extra whitespace
+     tweet = re.sub(r'\s+', ' ', tweet).strip()
+     return tweet
+
+ def cleaning_stopwords(text, stop_words):
+     return " ".join([word for word in str(text).split() if word not in stop_words])
+
+ from nltk.corpus import stopwords
+
+ # Turkish stop words
+ turkish_stopwords = stopwords.words('turkish')
+ turkish_stopwords.append("bir")
+ turkish_stopwords = set(turkish_stopwords)
+ ## there are some odd words in this list; better not to rely on it
+
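+ # A quick sketch of what the cleaning helpers above do (illustrative only;
+ # the exact stop-word behaviour depends on NLTK's downloaded 'turkish' list):
+ #   preprocess_tweet("Buuu çok kötüüü!!! 123")              # -> "buu çok kötüü"
+ #   cleaning_stopwords("bu bir deneme", turkish_stopwords)  # -> "deneme"
+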
+ from sklearn import preprocessing
+ from nltk.tokenize import word_tokenize
+
+ def prep_and_sw_and_tokenize(df):
+     turkish_stopwords = stopwords.words('turkish')
+     turkish_stopwords.append("bir")
+     stop_words = set(turkish_stopwords)
+     df["text"] = df["text"].apply(preprocess_tweet)
+     df['text'] = df["text"].apply(lambda text: cleaning_stopwords(text, stop_words))
+     #df['text'] = df.apply(remove_one_character_words, axis=1)
+     return df
+
+ tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
+
+ class BertClassifierConv1D(nn.Module):
+     def __init__(self, dropout=0.5, num_classes=5):
+         super(BertClassifierConv1D, self).__init__()
+         self.bert = BertModel.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', return_dict=True)
+         self.conv1d = nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=128, kernel_size=5)
+         self.bilstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=1, bidirectional=True, batch_first=True)
+         self.dropout = nn.Dropout(dropout)
+         self.linear = nn.Linear(128, num_classes)  # 128 = 2 * 64 (bidirectional LSTM output)
+
+     def forward(self, input_id, mask):
+         output = self.bert(input_ids=input_id, attention_mask=mask).last_hidden_state
+         output = output.permute(0, 2, 1)  # swap dimensions to prepare for the Conv1d layer
+         output = self.conv1d(output)
+         output, _ = self.bilstm(output.transpose(1, 2))  # back to (batch, seq, channels)
+         output = self.dropout(output)
+         output = self.linear(output.mean(dim=1))  # mean-pool over the sequence
+         return output
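+
+ # Tensor-shape sketch for a batch of B inputs padded to max_length=512
+ # (assumes the 768-dim hidden size of this BERT-base checkpoint):
+ #   last_hidden_state (B, 512, 768) -> permute    -> (B, 768, 512)
+ #   Conv1d(k=5)       -> (B, 128, 508) -> transpose -> (B, 508, 128)
+ #   BiLSTM(64, bidir) -> (B, 508, 128) -> mean(dim=1) -> (B, 128) -> Linear -> (B, 5)
+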
+ class Dataset(torch.utils.data.Dataset):
+     def __init__(self, df):
+         self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt") for text in df]
+
+     def __len__(self):
+         return len(self.texts)
+
+     def __getitem__(self, idx):
+         batch_texts = self.texts[idx]
+         return batch_texts
+
+ def evaluate(model, test_data):
+     test = Dataset(test_data)
+     test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)
+
+     # Inference runs on the CPU here; the CUDA path is kept commented out.
+     #use_cuda = torch.cuda.is_available()
+     #device = torch.device("cuda" if use_cuda else "cpu")
+     device = torch.device("cpu")
+     #if use_cuda:
+     #    model = model.cuda()
+
+     model.eval()  # disable dropout for inference
+     output_indices = []
+     with torch.no_grad():
+         for test_input in test_dataloader:
+             mask = test_input['attention_mask'].to(device)
+             input_id = test_input['input_ids'].squeeze(1).to(device)
+
+             output = model(input_id, mask)
+
+             batch_indices = output.argmax(dim=1).tolist()
+             output_indices.extend(batch_indices)
+
+     return output_indices
+
+ def auth(username, password):
+     if username == "Hive_Hereos" and password == "Y2IB3HV8GBXED00S":
+         return True
+     else:
+         return False
+
+ model = BertClassifierConv1D()
+ model.load_state_dict(torch.load(r"sontotalmodel_finallll.pt", map_location=torch.device('cpu')))
+
+ import logging
+ logging.basicConfig(filename=r'app.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)
+
+ def predict(df):
+     # TODO:
+     df["offensive"] = 1
+     df["target"] = None
+     # ***************************
+     try:
+         # WRITE YOUR INFERENCE STEPS BELOW # HERE
+         text = df["text"]  # keep the raw text so it can be restored after preprocessing
+         df = prep_and_sw_and_tokenize(df)
+         #df.to_csv("preprocess.csv", index=False, sep="|")
+         labels = {'INSULT': 0,
+                   'OTHER': 1,
+                   'PROFANITY': 2,
+                   'RACIST': 3,
+                   'SEXIST': 4}
+         logging.info("Starting")
+
+         logging.info("Model loaded")
+         logging.info(df.text)
+         a = evaluate(model, df["text"])
+
+         test_labels = []
+         for number in a:
+             label = list(labels.keys())[list(labels.values()).index(number)]  # map the index back to its label
+             test_labels.append(label)  # collect the predicted label
+         df["target"] = test_labels
+
+         # Everything except OTHER counts as offensive
+         for index, row in df.iterrows():
+             if row['target'] == 'OTHER':
+                 df.at[index, 'offensive'] = 0
+         df["text"] = text
+     except Exception as e:
+         logging.error("Error occurred", exc_info=True)
+         raise e
+     #
+     # *********** END ***********
+
+     return df
+
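+ # A terser, equivalent way to invert the label map inside predict() (sketch):
+ #   id2label = {v: k for k, v in labels.items()}
+ #   df["target"] = [id2label[i] for i in a]
+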
+ def get_file(file):
+     output_file = "output_Hive_Hereos.csv"
+
+     # For Windows users, normalize the path separator
+     file_name = file.name.replace("\\", "/")
+
+     df = pd.read_csv(file_name, sep="|")
+
+     predict(df)  # predict() fills the columns in place on this same frame
+     df.to_csv(output_file, index=False, sep="|")
+     return output_file
+
+ # Launch the interface with username/password authentication
+ iface = gr.Interface(get_file, "file", "file")
+
+ if __name__ == "__main__":
+     iface.launch(share=True, auth=auth, debug=True)
+
+ iface.close()
+
+ import session_info
+ session_info.show()
+
finalberturk_ensemble.py ADDED
@@ -0,0 +1,296 @@
+ # -*- coding: utf-8 -*-
+ """FINALberturk_ensemble.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1yAhhmVl42CAD5BCvUCtjMO7utTU2cGqE
+ """
+
+ !pip install transformers
+
+ # Commented out IPython magic to ensure Python compatibility.
+ import numpy as np  # linear algebra
+ import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+
+ # For EDA
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ # Packages for general use throughout the notebook.
+ import random
+ import warnings
+ import time
+ # %matplotlib inline
+ from sklearn.model_selection import train_test_split
+
+ # Display columns in full
+ pd.set_option('display.max_colwidth', None)
+
+ # For building the model (note: the TensorFlow/Keras imports below are not
+ # actually used by the PyTorch model defined later in this script)
+ import tensorflow as tf
+ from tensorflow.keras.layers import Add, GlobalAvgPool1D, MaxPool1D, Activation, BatchNormalization, Embedding, LSTM, Dense, Bidirectional, Input, SpatialDropout1D, Dropout, Conv1D
+ from tensorflow.keras import Model
+ from transformers import BertTokenizer, TFBertModel
+ from tensorflow.keras.activations import relu
+
+ from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
+
+ # Input data files are available in the read-only "../input/" directory
+ import os
+ for dirname, _, filenames in os.walk('/kaggle/input'):
+     for filename in filenames:
+         print(os.path.join(dirname, filename))
+
+ import torch
+ import numpy as np
+ from transformers import BertTokenizer, BertModel
+ import time
+ from datetime import datetime
+ import matplotlib.pyplot as plt
+ import torch
+ import torch.nn as nn
+ from torch.optim import Adam
+ from tqdm import tqdm
+ from torch.optim.lr_scheduler import ReduceLROnPlateau
+
+ !pip install session_info
+
+ import session_info
+ session_info.show()
+
+ dataset = pd.read_csv(r"train_with_preprocess.csv")
+ dataset
+
+ df = dataset[["first_p_sec_sw", "target"]]
+ df.columns = ["text", "target"]
+ df
+
+ tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
+
+ labels = {'INSULT': 0,
+           'OTHER': 1,
+           'PROFANITY': 2,
+           'RACIST': 3,
+           'SEXIST': 4}
+
+ class Dataset(torch.utils.data.Dataset):
+
+     def __init__(self, df):
+         self.labels = [labels[label] for label in df['target']]
+         self.texts = [tokenizer(text,
+                                 padding='max_length', max_length=512, truncation=True,
+                                 return_tensors="pt") for text in df['text']]
+
+     def classes(self):
+         return self.labels
+
+     def __len__(self):
+         return len(self.labels)
+
+     def get_batch_labels(self, idx):
+         # Fetch a batch of labels
+         return np.array(self.labels[idx])
+
+     def get_batch_texts(self, idx):
+         # Fetch a batch of inputs
+         return self.texts[idx]
+
+     def __getitem__(self, idx):
+         batch_texts = self.get_batch_texts(idx)
+         batch_y = self.get_batch_labels(idx)
+         return batch_texts, batch_y
+
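+ # Each item is a (tokenized_text, label) pair; a quick shape sketch:
+ #   texts, y = Dataset(df_train)[0]
+ #   texts['input_ids'].shape   # torch.Size([1, 512]) -- squeezed later in train()
+ #   y                          # 0-d array holding the class index from `labels`
+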
+ np.random.seed(112)
+ df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
+                                      [int(.8 * len(df)), int(.9 * len(df))])
+
+ print(len(df_train), len(df_val), len(df_test))
+
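+ # np.split with cut points at 80% and 90% of the shuffled frame yields an
+ # 80/10/10 train/val/test split, e.g. 12,000 rows -> 9,600 / 1,200 / 1,200.
+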
+ class BertClassifierConv1D(nn.Module):
+     def __init__(self, dropout=0.5, num_classes=5):
+         super(BertClassifierConv1D, self).__init__()
+         self.bert = BertModel.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', return_dict=True)
+         self.conv1d = nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=128, kernel_size=5)
+         self.bilstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=1, bidirectional=True, batch_first=True)
+         self.dropout = nn.Dropout(dropout)
+         self.linear = nn.Linear(128, num_classes)  # 128 = 2 * 64 (bidirectional LSTM output)
+
+     def forward(self, input_id, mask):
+         output = self.bert(input_ids=input_id, attention_mask=mask).last_hidden_state
+         output = output.permute(0, 2, 1)  # swap dimensions to prepare for the Conv1d layer
+         output = self.conv1d(output)
+         output, _ = self.bilstm(output.transpose(1, 2))  # back to (batch, seq, channels)
+         output = self.dropout(output)
+         output = self.linear(output.mean(dim=1))  # mean-pool over the sequence
+         return output
+
+ def plot_graphs(history, string):
+     plt.plot(history[string])
+     plt.plot(history['val_' + string])
+     plt.xlabel("Epochs")
+     plt.ylabel(string)
+     plt.legend([string, 'val_' + string])
+     plt.show()
+
+ def train(model, train_data, val_data, learning_rate, epochs, patience=3):
+     train, val = Dataset(train_data), Dataset(val_data)
+
+     train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)
+     val_dataloader = torch.utils.data.DataLoader(val, batch_size=32)
+
+     use_cuda = torch.cuda.is_available()
+     device = torch.device("cuda" if use_cuda else "cpu")
+
+     criterion = nn.CrossEntropyLoss()
+     optimizer = Adam(model.parameters(), lr=learning_rate)
+
+     if use_cuda:
+         model = model.cuda()
+         criterion = criterion.cuda()
+
+     history = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': []}
+     best_val_loss = float('inf')
+     counter = 0
+     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True, cooldown=0)
+
+     for epoch_num in range(epochs):
+         model.train()  # enable dropout for the training pass
+         total_acc_train = 0
+         total_loss_train = 0
+
+         for train_input, train_label in tqdm(train_dataloader):
+             train_label = train_label.to(device)
+             mask = train_input['attention_mask'].to(device)
+             input_id = train_input['input_ids'].squeeze(1).to(device)
+
+             output = model(input_id, mask)
+
+             batch_loss = criterion(output, train_label.long())
+             total_loss_train += batch_loss.item()
+
+             acc = (output.argmax(dim=1) == train_label).sum().item()
+             total_acc_train += acc
+
+             model.zero_grad()
+             batch_loss.backward()
+             optimizer.step()
+
+         total_acc_val = 0
+         total_loss_val = 0
+
+         model.eval()  # disable dropout for validation
+         with torch.no_grad():
+             for val_input, val_label in val_dataloader:
+                 val_label = val_label.to(device)
+                 mask = val_input['attention_mask'].to(device)
+                 input_id = val_input['input_ids'].squeeze(1).to(device)
+
+                 output = model(input_id, mask)
+
+                 batch_loss = criterion(output, val_label.long())
+                 total_loss_val += batch_loss.item()
+
+                 acc = (output.argmax(dim=1) == val_label).sum().item()
+                 total_acc_val += acc
+
+         train_loss = total_loss_train / len(train_data)
+         train_acc = total_acc_train / len(train_data)
+         val_loss = total_loss_val / len(val_data)
+         val_acc = total_acc_val / len(val_data)
+         history['loss'].append(train_loss)
+         history['accuracy'].append(train_acc)
+         history['val_loss'].append(val_loss)
+         history['val_accuracy'].append(val_acc)
+         print(f'Epochs: {epoch_num + 1} | Train Loss: {train_loss:.3f} | Train Accuracy: {train_acc:.3f} | Val Loss: {val_loss:.3f} | Val Accuracy: {val_acc:.3f}')
+
+         # Early stopping: halt after `patience` epochs without val-loss improvement
+         if val_loss < best_val_loss:
+             best_val_loss = val_loss
+             counter = 0
+         else:
+             counter += 1
+             if counter >= patience:
+                 print(f'Early stopping at epoch {epoch_num+1}')
+                 break
+         scheduler.step(val_loss)
+
+     plot_graphs(history, "accuracy")
+     plot_graphs(history, "loss")
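+
+ # Note (sketch): the early-stopping branch above halts training but does not
+ # restore the best weights; a common refinement is to checkpoint whenever
+ # val_loss improves, e.g. torch.save(model.state_dict(), "best.pt"), and
+ # reload that file after the loop.
+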
+ EPOCHS = 15
+ model = BertClassifierConv1D()
+ LR = 1e-6
+
+ train(model, df_train, df_val, LR, EPOCHS)
+
+ # `datetime` is part of the Python standard library, so no pip install is needed.
+
+ now = datetime.now()
+ seed = int(now.strftime("%Y%m%d%H%M%S"))  # timestamp-based seed
+ print(seed)
+ random.seed(seed)
+ random_time = random.randint(0, 350)
+ model_path = 'model_weights' + str(random_time) + ".pth"
+ torch.save(model.state_dict(), model_path)
+ print(model_path)
+
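+ # To reload this checkpoint later (sketch; model_path is printed above):
+ #   model = BertClassifierConv1D()
+ #   model.load_state_dict(torch.load(model_path, map_location="cpu"))
+ #   model.eval()
+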
+ def evaluate(model, test_data):
+     test = Dataset(test_data)
+     test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)
+
+     use_cuda = torch.cuda.is_available()
+     device = torch.device("cuda" if use_cuda else "cpu")
+
+     if use_cuda:
+         model = model.cuda()
+
+     model.eval()  # disable dropout for evaluation
+     total_acc_test = 0
+     output_indices = []
+     test_labels = []
+     with torch.no_grad():
+         for test_input, test_label in test_dataloader:
+             test_label = test_label.to(device)
+             mask = test_input['attention_mask'].to(device)
+             input_id = test_input['input_ids'].squeeze(1).to(device)
+
+             output = model(input_id, mask)
+
+             acc = (output.argmax(dim=1) == test_label).sum().item()
+             total_acc_test += acc
+
+             batch_indices = output.argmax(dim=1).tolist()
+             output_indices.extend(batch_indices)
+             test_labels.extend(test_label.tolist())  # store plain ints, not tensors
+
+     print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
+     return output_indices, test_labels
+
+ y_pred, y_test = evaluate(model, df_test)
+
+ y_pred_tensor = torch.tensor(y_pred)
+ y_test_tensor = torch.tensor(y_test)
+
+ # scikit-learn metrics expect (y_true, y_pred) in that order
+ print(classification_report(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu()), output_dict=True))
+
+ from sklearn.metrics import f1_score
+ f1_score(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu()), average='macro')
+
+ def conf_matrix(y_test, y_pred):
+     cm = confusion_matrix(y_test, y_pred, normalize="true")
+     class_names = ["INSULT", "OTHER", "PROFANITY", "RACIST", "SEXIST"]
+     sns.heatmap(cm, annot=True, cmap="Blues", xticklabels=class_names, yticklabels=class_names)
+     plt.xlabel('Predicted Class')
+     plt.ylabel('True Class')
+     plt.show()
+
+ conf_matrix(np.array(y_test_tensor.cpu()), np.array(y_pred_tensor.cpu()))