Maslov-Artem committed
Commit: cb2adb5
Parent(s): 7983c1c
Add 3 classifiers
Browse files
- 17/config.json +41 -0
- 17/generation_config.json +7 -0
- 17/model.safetensors +3 -0
- model/__init__.py +0 -0
- model/best_bert_weights.pth +3 -0
- model/funcs.py +218 -0
- model/int_vocab.json +3 -0
- model/model.py +113 -0
- model/model_weights.pt +3 -0
- model/vocab.json +3 -0
- pages/review_predictor.py +59 -11
- pages/text_generator.py +9 -4
- preprocessing/__init__.py +0 -0
- preprocessing/preprocessing.py +30 -0
- preprocessing/rnn_preprocessing.py +81 -0
17/config.json
ADDED
@@ -0,0 +1,41 @@
+{
+  "_name_or_path": "sberbank-ai/rugpt3small_based_on_gpt2",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 1,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 2048,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.2",
+  "use_cache": true,
+  "vocab_size": 50264
+}
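For reference, a minimal sketch of how a checkpoint saved under 17/ is typically loaded with the transformers library (pages/text_generator.py below does the actual loading); the prompt and the sampling parameters here are illustrative, not part of the commit:

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# The tokenizer comes from the base checkpoint named in "_name_or_path";
# the fine-tuned weights and this config are read from the 17/ directory.
tokenizer = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")
model = GPT2LMHeadModel.from_pretrained("17/")

input_ids = tokenizer.encode("Ваш вопрос", return_tensors="pt")  # illustrative prompt
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=50, do_sample=True, top_p=0.9)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))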
17/generation_config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.38.2"
+}
17/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a99f27f7efc5a609d3bb2f30d15980d3384ecd47f4b0806c251523071a7648a
+size 500941440
model/__init__.py
ADDED
File without changes
model/best_bert_weights.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0e1a89d2cb79075e1a4de471ef11654117952234bb65dee721e2909099fa4d4
+size 117120027
model/funcs.py
ADDED
@@ -0,0 +1,218 @@
+import os
+
+import matplotlib.pyplot as plt
+import torch
+import torch.nn as nn
+from sklearn.metrics import f1_score
+from torch.utils.data import Dataset
+
+
+def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
+    # Create the tokenizer and model objects
+    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
+    model = model_class.from_pretrained(pretrained_weights)
+    return model, tokenizer
+
+
+def train_model(
+    DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
+):
+    # Create a folder for saving weights if it does not exist yet
+    if not os.path.exists("weights"):
+        os.makedirs("weights")
+
+    # Initialize lists for storing loss and accuracy values
+    train_losses = []
+    train_accuracies = []
+    val_losses = []
+    val_accuracies = []
+    val_f1_scores = []
+
+    best_val_loss = float("inf")
+
+    for epoch in range(epochs):
+        model.train()
+        train_loss = 0.0
+        total = 0
+        correct = 0
+        for batch in train_loader:
+            optimizer.zero_grad()
+            input_ids, attention_mask, labels = batch
+            input_ids = input_ids.to(DEVICE)
+            attention_mask = attention_mask.to(DEVICE)
+            labels = labels.to(DEVICE)
+            outputs = model(input_ids, attention_mask=attention_mask)
+            loss = criterion(outputs, labels.float().unsqueeze(1))
+            loss.backward()
+            optimizer.step()
+            train_loss += loss.item()
+            preds = torch.round(torch.sigmoid(outputs))
+            total += labels.size(0)
+            correct += (preds == labels.unsqueeze(1)).sum().item()
+
+        accuracy = correct / total
+        avg_train_loss = train_loss / len(train_loader)
+        train_losses.append(avg_train_loss)
+        train_accuracies.append(accuracy)
+
+        model.eval()
+        val_loss = 0.0
+        total_preds = []
+        total_labels = []
+        with torch.no_grad():
+            total = 0
+            correct = 0
+            for batch in valid_loader:
+                input_ids, attention_mask, labels = batch
+                input_ids = input_ids.to(DEVICE)
+                attention_mask = attention_mask.to(DEVICE)
+                labels = labels.to(DEVICE)
+                outputs = model(input_ids, attention_mask=attention_mask)
+                loss = criterion(outputs, labels.float().unsqueeze(1))
+                val_loss += loss.item()
+                preds = torch.round(torch.sigmoid(outputs))
+                total += labels.size(0)
+                correct += (preds == labels.unsqueeze(1)).sum().item()
+                total_preds.extend(preds.detach().cpu().numpy())
+                total_labels.extend(labels.detach().cpu().numpy())
+
+        accuracy = correct / total
+        f1 = f1_score(total_labels, total_preds)
+        avg_val_loss = val_loss / len(valid_loader)
+        val_losses.append(avg_val_loss)
+        val_accuracies.append(accuracy)
+        val_f1_scores.append(f1)
+
+        # If this is the best model so far, save its weights
+        if avg_val_loss < best_val_loss:
+            best_val_loss = avg_val_loss
+            torch.save(model.state_dict(), "weights/best_bert_weights.pth")
+
+        print(f"Epoch {epoch+1}")
+        print(
+            f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
+        )
+        print(
+            f"Training Accuracy : {train_accuracies[-1]:.4f}. Validation Accuracy : {val_accuracies[-1]:.4f}"
+        )
+        print(25 * "==")
+
+    return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
+
+
+def predict_sentiment(text, model, tokenizer, DEVICE):
+    # The model must be in evaluation mode
+    model.eval()
+
+    # Tokenize the text and convert it to tensors
+    encoding = tokenizer.encode_plus(
+        text, padding="max_length", truncation=True, max_length=512, return_tensors="pt"
+    )
+    input_ids = encoding["input_ids"].to(DEVICE)
+    attention_mask = encoding["attention_mask"].to(DEVICE)
+
+    # Run the text through the model
+    with torch.no_grad():
+        output = model(input_ids, attention_mask=attention_mask)
+
+    # Convert the model output to a probability with a sigmoid
+    probability = torch.sigmoid(output).item()
+
+    # Set the decision threshold
+    threshold = 0.5
+
+    # Return the positive or negative class
+    if probability >= threshold:
+        return 1
+        # return f"С вероятностью {probability*100:.2f}% это положительный отзыв"
+    else:
+        return 0
+        # return f"С вероятностью {(1-probability)*100:.2f}% это отрицательный отзыв"
+
+
+def load_model(model_class, pretrained_weights, weights_path):
+    # Create an instance of the classifier
+    model = ruBERTClassifier(model_class, pretrained_weights)
+
+    # Load the weights
+    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
+
+    return model
+
+
+def plot_metrics(
+    train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
+):
+    epochs = range(1, len(train_losses) + 1)
+
+    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
+
+    # First subplot: losses
+    axs[0].plot(epochs, train_losses, "r--", label="Training Loss")
+    axs[0].plot(epochs, val_losses, "b--", linewidth=2, label="Validation Loss")
+    axs[0].set_title("Training and Validation Loss")
+    axs[0].set_xlabel("Epochs")
+    axs[0].set_ylabel("Loss")
+    axs[0].legend()
+
+    # Second subplot: accuracy and F1 score
+    axs[1].plot(epochs, train_accuracies, "r-", linewidth=2, label="Training Accuracy")
+    axs[1].plot(epochs, val_accuracies, "b-", linewidth=2, label="Validation Accuracy")
+    axs[1].plot(epochs, val_f1_scores, "g-", linewidth=2, label="Validation F1 Score")
+    axs[1].set_title("Training and Validation Accuracy and F1 Score")
+    axs[1].set_xlabel("Epochs")
+    axs[1].set_ylabel("Metric Value")
+    axs[1].legend()
+
+    plt.tight_layout()
+    plt.savefig("metrics_plot.png")  # Save the figure to a file
+    plt.show()
+
+
+class TextClassificationDataset(Dataset):
+    def __init__(self, texts, labels, tokenizer):
+        self.texts = texts
+        self.labels = labels
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        text = self.texts[idx]
+        label = self.labels[idx]
+        encoding = self.tokenizer.encode_plus(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=512,
+            return_tensors="pt",
+        )
+        return (
+            encoding["input_ids"].squeeze(),
+            encoding["attention_mask"].squeeze(),
+            torch.tensor(label),
+        )
+
+
+class ruBERTClassifier(nn.Module):
+    def __init__(self, model_class, pretrained_weights):
+        super().__init__()
+        self.bert = model_class.from_pretrained(pretrained_weights)
+        # Freeze all parameters
+        for param in self.bert.parameters():
+            param.requires_grad = False
+
+        # Unfreeze the BertPooler layer
+        for param in self.bert.pooler.parameters():
+            param.requires_grad = True
+
+        self.linear = nn.Sequential(
+            nn.Linear(312, 256),
+            nn.ReLU(),
+            nn.Dropout(),
+            nn.Linear(256, 1),
+        )
+
+    def forward(self, x, attention_mask):
+        bert_out = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
+        out = self.linear(bert_out)
+        return out
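For context, a minimal usage sketch of these helpers, assuming the cointegrated/rubert-tiny2 checkpoint used elsewhere in this commit; the BCEWithLogitsLoss criterion, the Adam optimizer and the two toy reviews are illustrative choices, not fixed by funcs.py itself:

import torch
import torch.nn as nn
import transformers
from torch.utils.data import DataLoader

from model.funcs import (TextClassificationDataset, ruBERTClassifier,
                         predict_sentiment, train_model)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
pretrained = "cointegrated/rubert-tiny2"  # same checkpoint as in pages/review_predictor.py

tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
model = ruBERTClassifier(transformers.AutoModel, pretrained).to(DEVICE)

# Two toy reviews just to exercise the loop; real training uses the review dataset.
texts = ["Отличный сервис, всем рекомендую", "Ужасное обслуживание, не советую"]
labels = [1, 0]
loader = DataLoader(TextClassificationDataset(texts, labels, tokenizer), batch_size=2)

criterion = nn.BCEWithLogitsLoss()  # matches the sigmoid/round logic inside train_model
optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=1e-3)

# One epoch, reusing the same tiny loader for training and validation.
train_model(DEVICE, 1, model, loader, loader, optimizer, criterion)
print(predict_sentiment("Очень понравилось", model, tokenizer, DEVICE))  # prints 1 or 0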
model/int_vocab.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c109e13d36a06af12b0a0b65fe09cf5af212a12d95ad715b272d3e0a757ca9c7
+size 13374732
model/model.py
ADDED
@@ -0,0 +1,113 @@
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+
+HIDDEN_SIZE = 32
+VOCAB_SIZE = 196906
+EMBEDDING_DIM = 64  # embedding_dim
+SEQ_LEN = 100
+BATCH_SIZE = 64
+
+
+class BahdanauAttention(nn.Module):
+    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.W_q = nn.Linear(hidden_size, hidden_size)
+        self.W_k = nn.Linear(hidden_size, hidden_size)
+        self.W_v = nn.Linear(hidden_size, 1)
+
+        self.tanh = nn.Tanh()
+
+    def forward(
+        self,
+        lstm_outputs: torch.Tensor,  # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        final_hidden: torch.Tensor,  # BATCH_SIZE x HIDDEN_SIZE
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Bahdanau Attention module
+
+        Args:
+            keys (torch.Tensor): lstm hidden states (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+            query (torch.Tensor): lstm final hidden state (BATCH_SIZE, HIDDEN_SIZE)
+
+        Returns:
+            Tuple[torch.Tensor]:
+                context_matrix (BATCH_SIZE, HIDDEN_SIZE)
+                attention scores (BATCH_SIZE, SEQ_LEN)
+        """
+        # input:
+        # keys – lstm hidden states (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+        # query - lstm final hidden state (BATCH_SIZE, HIDDEN_SIZE)
+
+        keys = self.W_k(lstm_outputs)
+        # print(f'After linear keys: {keys.shape}')
+
+        query = self.W_q(final_hidden)
+        # print(f"After linear query: {query.shape}")
+
+        # print(f"query.unsqueeze(1) {query.unsqueeze(1).shape}")
+
+        sum = query.unsqueeze(1) + keys
+        # print(f"After sum: {sum.shape}")
+
+        tanhed = self.tanh(sum)
+        # print(f"After tanhed: {tanhed.shape}")
+
+        vector = self.W_v(tanhed).squeeze(-1)
+        # print(f"After linear vector: {vector.shape}")
+
+        att_weights = torch.softmax(vector, -1)
+        # print(f"After softmax att_weights: {att_weights.shape}")
+
+        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze()
+        # print(f"After bmm context: {context.shape}")
+
+        return context, att_weights
+
+        # att_weights = self.linear(lstm_outputs)
+        # # print(f'After linear: {att_weights.shape, final_hidden.unsqueeze(2).shape}')
+
+        # att_weights = self.linear(lstm_outputs)
+        # # print(f'After linear: {att_weights.shape, final_hidden.unsqueeze(2).shape}')
+        # att_weights = torch.bmm(att_weights, final_hidden.unsqueeze(2))
+        # # print(f'After bmm: {att_weights.shape}')
+        # att_weights = F.softmax(att_weights.squeeze(2), dim=1)
+        # # print(f'After softmax: {att_weights.shape}')
+        # cntxt = torch.bmm(lstm_outputs.transpose(1, 2), att_weights.unsqueeze(2))
+        # # print(f'Context: {cntxt.shape}')
+        # concatted = torch.cat((cntxt, final_hidden.unsqueeze(2)), dim=1)
+        # # print(f'Concatted: {concatted.shape}')
+        # att_hidden = self.tanh(self.align(concatted.squeeze(-1)))
+        # # print(f'Att Hidden: {att_hidden.shape}')
+        # return att_hidden, att_weights
+
+
+# Test on random numbers
+BahdanauAttention()(torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE), torch.randn(BATCH_SIZE, HIDDEN_SIZE))[1].shape
+
+
+class LSTMConcatAttentionEmbed(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+        self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
+        # self.embedding = embedding_layer
+        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
+        self.attn = BahdanauAttention(HIDDEN_SIZE)
+        self.clf = nn.Sequential(
+            nn.Linear(HIDDEN_SIZE, 128),
+            nn.Dropout(),
+            nn.Tanh(),
+            nn.Linear(128, 1)
+        )
+
+    def forward(self, x):
+        embeddings = self.embedding(x)
+        outputs, (h_n, _) = self.lstm(embeddings)
+        att_hidden, att_weights = self.attn(outputs, h_n.squeeze(0))
+        out = self.clf(att_hidden)
+        return out, att_weights
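A quick shape check for the classifier above, in the spirit of the module-level test on random numbers; the token ids are random and only the tensor shapes matter:

import torch
from model.model import BATCH_SIZE, SEQ_LEN, LSTMConcatAttentionEmbed

model = LSTMConcatAttentionEmbed()
dummy_ids = torch.randint(0, 1000, (BATCH_SIZE, SEQ_LEN))  # BATCH_SIZE x SEQ_LEN fake token ids
logits, att_weights = model(dummy_ids)
print(logits.shape)       # torch.Size([64, 1]): one sentiment logit per review
print(att_weights.shape)  # torch.Size([64, 100]): one attention weight per token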
model/model_weights.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de960bfb6327e0509297628c3cec5bc456e6dc681b29aca9bead6330e941d44e
+size 50489371
model/vocab.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e07ef2cc3bbaac41bb510936e8b958c808ffdfcf2e60b39bb7c9d330a6fe67f8
+size 12980920
pages/review_predictor.py
CHANGED
@@ -1,8 +1,17 @@
+import json
import pickle

+import pandas as pd
import streamlit as st
+import torch
+import torch.nn as nn
+import transformers

-from
+from model.funcs import (create_model_and_tokenizer, load_model,
+                         predict_sentiment)
+from model.model import LSTMConcatAttentionEmbed
+from preprocessing.preprocessing import data_preprocessing
+from preprocessing.rnn_preprocessing import preprocess_single_string

# Load preprocessing steps
with open("vectorizer.pkl", "rb") as f:
@@ -12,29 +21,63 @@ with open("vectorizer.pkl", "rb") as f:
with open("logreg_model.pkl", "rb") as f:
    logreg_predictor = pickle.load(f)

+model_concat_embed = LSTMConcatAttentionEmbed()
+model_concat_embed.load_state_dict(torch.load("model/model_weights.pt"))

-
-
-
+with open("model/vocab.json", "r") as f:
+    vocab_to_int = json.load(f)
+
+with open("model/int_vocab.json", "r") as f:
+    int_to_vocab = json.load(f)
+
+model_class = transformers.AutoModel
+tokenizer_class = transformers.AutoTokenizer
+pretrained_weights = "cointegrated/rubert-tiny2"
+weights_path = "model/best_bert_weights.pth"
+model = load_model(model_class, pretrained_weights, weights_path)
+tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
+
+
+def plot_and_predict(review: str, SEQ_LEN: int, model: nn.Module):
+    inp = preprocess_single_string(review, SEQ_LEN, vocab_to_int)
+    model.eval()
+    with torch.inference_mode():
+        pred, _ = model(inp.long().unsqueeze(0))
+    pred = pred.sigmoid().item()
+    return 1 if pred > 0.75 else 0
+
+
+def preprocess_text_logreg(text):
    # Apply preprocessing steps (cleaning, tokenization, vectorization)
    clean_text = data_preprocessing(
        text
    ) # Assuming data_preprocessing is your preprocessing function
    print("Clean text ", clean_text)
-    vectorized_text =
+    vectorized_text = logreg_vectorizer.transform([" ".join(clean_text)])
    return vectorized_text


# Define function for making predictions
-
-def predict_sentiment(text):
+def predict_sentiment_logreg(text):
    # Preprocess input text
-    processed_text =
+    processed_text = preprocess_text_logreg(text)
    # Make prediction
    prediction = logreg_predictor.predict(processed_text)
    return prediction


+metrics = {
+    "Models": ["Logistic Regression", "LSTM + attention", "ruBERTtiny2"],
+    "f1-macro score": [0.94376, 1, 0.94070],
+}
+
+
+col1, col2 = st.columns([1, 3])
+df = pd.DataFrame(metrics)
+df.set_index("Models", inplace=True)
+df.index.name = "Model"
+
+
st.sidebar.title("Model Selection")
model_type = st.sidebar.radio("Select Model Type", ["Classic ML", "LSTM", "BERT"])
st.title("Review Prediction")
@@ -44,11 +87,14 @@ st.title("Sentiment Analysis with Logistic Regression")
text_input = st.text_input("Enter your review:")
if st.button("Predict"):
    if model_type == "Classic ML":
-        prediction =
+        prediction = predict_sentiment_logreg(text_input)
    elif model_type == "LSTM":
-        prediction =
+        prediction = plot_and_predict(
+            review=text_input, SEQ_LEN=25, model=model_concat_embed
+        )
    elif model_type == "BERT":
-        prediction =
+        prediction = predict_sentiment(text_input, model, tokenizer, "cpu")
+        st.write(prediction)

    if prediction == 1:
        st.write("prediction")
@@ -56,3 +102,5 @@ if st.button("Predict"):
    elif prediction == 0:
        st.write("prediction")
        st.write("Отзыв отрицательный")
+
+st.write(df)
pages/text_generator.py
CHANGED
@@ -2,12 +2,17 @@ import streamlit as st
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

-model_path = "finetuned_model/"
-model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
-tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-model = GPT2LMHeadModel.from_pretrained(model_path)

+@st.cache_data
+def load_model():
+    model_path = "17/"
+    model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
+    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+    model = GPT2LMHeadModel.from_pretrained(model_path)
+    return tokenizer, model

+
+tokenizer, model = load_model()
promt = st.text_input("Ask a question")
generate = st.button("Generate")
if generate:
preprocessing/__init__.py
ADDED
File without changes
preprocessing/preprocessing.py
ADDED
@@ -0,0 +1,30 @@
+import re
+import string
+
+import nltk
+import pymorphy2
+from nltk.tokenize import word_tokenize
+
+nltk.download("punkt")
+
+
+def clean_text(text: str) -> str:
+    text = text.lower()
+    text = re.sub(r"\w*(\w)\1{2,}\w*", "", text)
+    text = re.sub(r"\d+\w*", "", text)
+    text = re.sub(r"\[.*?\]", "", text)
+    text = text.translate(str.maketrans("", "", string.punctuation))
+    return text
+
+
+def lemmize_and_tokenize_text(text: str) -> list[str]:
+    morph = pymorphy2.MorphAnalyzer()
+    tokens = word_tokenize(text)
+    lemmas = [morph.parse(token)[0].normal_form for token in tokens]
+    return lemmas
+
+
+def data_preprocessing(text: str) -> list[str]:
+    cleaned_text = clean_text(text)
+    lemmized_text = lemmize_and_tokenize_text(cleaned_text)
+    return lemmized_text
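An illustrative call to this pipeline; the exact lemmas depend on pymorphy2's dictionaries, so no specific output is claimed:

from preprocessing.preprocessing import clean_text, data_preprocessing

raw = "Отличный товар, доставили за 2 дня!!!"  # made-up review text
print(clean_text(raw))          # lowercased text with digits, bracketed spans and punctuation stripped
print(data_preprocessing(raw))  # the same text as a list of lemmatized tokens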
preprocessing/rnn_preprocessing.py
ADDED
@@ -0,0 +1,81 @@
+import re
+import string
+
+import numpy as np
+import torch
+import nltk
+
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+
+stop_words = set(stopwords.words('english'))
+
+
+def data_preprocessing(text: str) -> str:
+    """Preprocess a string: lowercase it, remove html tags, punctuation,
+    stopwords and digits
+
+    Args:
+        text (str): input string for preprocessing
+
+    Returns:
+        str: preprocessed string
+    """
+
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)  # html tags
+    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+
+
+def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+
+
+def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+    """Make left-sided padding for an input list of tokens
+
+    Args:
+        review_int (list): input list of tokens
+        seq_len (int): max length of a sequence; if len(review_int[i]) > seq_len it is trimmed, else it is padded with zeros
+
+    Returns:
+        np.array: padded sequences
+    """
+    features = np.zeros((len(review_int), seq_len), dtype=int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+
+    return features
+
+
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose: bool = False
+) -> torch.tensor:
+    """Run all preprocessing steps on a single string
+
+    Args:
+        input_string (str): input single string for preprocessing
+        seq_len (int): max length of a sequence; if it is longer than seq_len it is trimmed, else it is padded with zeros
+        vocab_to_int (dict): word corpus {'word': int index}
+
+    Returns:
+        torch.tensor: preprocessed, padded sequence of token ids
+    """
+
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+            pass
+    result_padded = padding([result_list], seq_len)[0]
+
+    return torch.tensor(result_padded)
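A small sketch of the single-string path, using a made-up five-word vocabulary in place of the real vocab_to_int that pages/review_predictor.py loads from model/vocab.json; note that importing this module downloads the NLTK English stopword list:

from preprocessing.rnn_preprocessing import preprocess_single_string

# Hypothetical toy vocabulary; the real one has ~197k entries (VOCAB_SIZE in model/model.py).
vocab_to_int = {"great": 1, "phone": 2, "battery": 3, "lasts": 4, "long": 5}

ids = preprocess_single_string("Great phone, battery lasts long!", 10, vocab_to_int, verbose=True)
print(ids)  # 1-D tensor of length 10, left-padded with zeros; out-of-vocabulary words are skipped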