Maslov-Artem committed on
Commit
cb2adb5
·
1 Parent(s): 7983c1c

Add 3 classifiers

Browse files
17/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sberbank-ai/rugpt3small_based_on_gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 1,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 2,
11
+ "gradient_checkpointing": false,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "LABEL_0": 0
18
+ },
19
+ "layer_norm_epsilon": 1e-05,
20
+ "model_type": "gpt2",
21
+ "n_ctx": 2048,
22
+ "n_embd": 768,
23
+ "n_head": 12,
24
+ "n_inner": null,
25
+ "n_layer": 12,
26
+ "n_positions": 2048,
27
+ "pad_token_id": 0,
28
+ "reorder_and_upcast_attn": false,
29
+ "resid_pdrop": 0.1,
30
+ "scale_attn_by_inverse_layer_idx": false,
31
+ "scale_attn_weights": true,
32
+ "summary_activation": null,
33
+ "summary_first_dropout": 0.1,
34
+ "summary_proj_to_labels": true,
35
+ "summary_type": "cls_index",
36
+ "summary_use_proj": true,
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.38.2",
39
+ "use_cache": true,
40
+ "vocab_size": 50264
41
+ }
17/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.38.2"
7
+ }
17/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a99f27f7efc5a609d3bb2f30d15980d3384ecd47f4b0806c251523071a7648a
3
+ size 500941440
model/__init__.py ADDED
File without changes
model/best_bert_weights.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0e1a89d2cb79075e1a4de471ef11654117952234bb65dee721e2909099fa4d4
3
+ size 117120027
model/funcs.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import torch
3
+ import torch.nn as nn
4
+ from sklearn.metrics import f1_score
5
+ from torch.utils.data import Dataset
6
+
7
+
8
def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
    """Load a pretrained model and tokenizer from the same checkpoint identifier.

    Args:
        model_class: class exposing ``from_pretrained`` that builds the model.
        tokenizer_class: class exposing ``from_pretrained`` that builds the tokenizer.
        pretrained_weights: identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # The tokenizer is created first, then the model, both from the same identifier.
    loaded_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    loaded_model = model_class.from_pretrained(pretrained_weights)
    return loaded_model, loaded_tokenizer
13
+
14
+
15
def _run_batch(DEVICE, model, criterion, batch):
    """Move one ``(input_ids, attention_mask, labels)`` batch to DEVICE and score it.

    Returns:
        ``(loss, preds, labels)`` where ``preds`` are the sigmoid outputs rounded
        to 0/1 and ``labels`` is the label tensor on DEVICE.
    """
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(DEVICE)
    attention_mask = attention_mask.to(DEVICE)
    labels = labels.to(DEVICE)
    outputs = model(input_ids, attention_mask=attention_mask)
    # Labels are reshaped to a column so they line up with the model outputs.
    loss = criterion(outputs, labels.float().unsqueeze(1))
    preds = torch.round(torch.sigmoid(outputs.detach()))
    return loss, preds, labels


def train_model(
    DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
):
    """Train a binary classifier and validate it after every epoch.

    The weights with the lowest average validation loss seen so far are saved
    to ``weights/best_bert_weights.pth`` (the directory is created if needed).
    Per-epoch losses and accuracies are printed.

    Args:
        DEVICE: device the batches are moved to.
        epochs: number of passes over ``train_loader``.
        model: module called as ``model(input_ids, attention_mask=...)``.
        train_loader: non-empty iterable of ``(input_ids, attention_mask, labels)``.
        valid_loader: non-empty iterable with the same batch layout.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as ``criterion(outputs, labels_column)``.

    Returns:
        ``(train_losses, train_accuracies, val_losses, val_accuracies,
        val_f1_scores)`` — five lists with one value per epoch.
    """
    # Imported here because the module-level imports do not provide ``os``;
    # without it the directory check below raised NameError.
    import os

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs("weights", exist_ok=True)

    # Per-epoch history of losses and metrics.
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1_scores = []

    best_val_loss = float("inf")

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        total = 0
        correct = 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss, preds, labels = _run_batch(DEVICE, model, criterion, batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            total += labels.size(0)
            correct += (preds == labels.unsqueeze(1)).sum().item()

        train_losses.append(train_loss / len(train_loader))
        train_accuracies.append(correct / total)

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        total = 0
        correct = 0
        total_preds = []
        total_labels = []
        with torch.no_grad():
            for batch in valid_loader:
                loss, preds, labels = _run_batch(DEVICE, model, criterion, batch)
                val_loss += loss.item()
                total += labels.size(0)
                correct += (preds == labels.unsqueeze(1)).sum().item()
                total_preds.extend(preds.detach().cpu().numpy())
                total_labels.extend(labels.detach().cpu().numpy())

        avg_val_loss = val_loss / len(valid_loader)
        val_losses.append(avg_val_loss)
        val_accuracies.append(correct / total)
        val_f1_scores.append(f1_score(total_labels, total_preds))

        # Keep only the weights of the best model so far.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "weights/best_bert_weights.pth")

        print(f"Epoch {epoch+1}")
        print(
            f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
        )
        print(
            f"Training Accuracy : {train_accuracies[-1]:.4f}. Validation Accuracy : {val_accuracies[-1]:.4f}"
        )
        print(25 * "==")

    return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
99
+
100
+
101
def predict_sentiment(text, model, tokenizer, DEVICE, *, threshold=0.5, max_length=512):
    """Classify a single text as positive (1) or negative (0).

    Args:
        text: raw text to classify.
        model: module called as ``model(input_ids, attention_mask=...)``; it
            must return a single-element tensor for a single text.
        tokenizer: object exposing ``encode_plus``.
        DEVICE: device the encoded tensors are moved to.
        threshold: probability at or above which the text is labelled 1.
            Defaults to 0.5, the value previously hard-coded.
        max_length: padding/truncation length passed to the tokenizer.
            Defaults to 512, the value previously hard-coded.

    Returns:
        ``1`` if ``sigmoid(model output) >= threshold``, else ``0``.
    """
    # Inference only: the model is switched to evaluation mode.
    model.eval()

    # Tokenize the text and convert it to tensors.
    encoding = tokenizer.encode_plus(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    # Turn the raw output into a probability with a sigmoid.
    probability = torch.sigmoid(output).item()

    return 1 if probability >= threshold else 0
129
+
130
+
131
def load_model(model_class, pretrained_weights, weights_path):
    """Build a ``ruBERTClassifier`` and restore its weights from disk.

    Args:
        model_class: encoder class forwarded to ``ruBERTClassifier``.
        pretrained_weights: checkpoint identifier forwarded to ``ruBERTClassifier``.
        weights_path: path of the saved state dict.

    Returns:
        The classifier with the stored state dict loaded (tensors mapped to CPU).
    """
    classifier = ruBERTClassifier(model_class, pretrained_weights)
    # The state dict is read onto the CPU regardless of where it was saved.
    stored_state = torch.load(weights_path, map_location="cpu")
    classifier.load_state_dict(stored_state)
    return classifier
139
+
140
+
141
def plot_metrics(
    train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
):
    """Plot the training history, save it to ``metrics_plot.png`` and show it.

    The left panel holds the training and validation loss; the right panel
    holds training accuracy, validation accuracy and validation F1.

    Args:
        train_losses: training loss per epoch.
        train_accuracies: training accuracy per epoch.
        val_losses: validation loss per epoch.
        val_accuracies: validation accuracy per epoch.
        val_f1_scores: validation F1 score per epoch.
    """
    epochs = range(1, len(train_losses) + 1)
    _, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: losses.
    loss_curves = (
        (train_losses, "r--", {"label": "Training Loss"}),
        (val_losses, "b--", {"linewidth": 2, "label": "Validation Loss"}),
    )
    for values, fmt, options in loss_curves:
        loss_ax.plot(epochs, values, fmt, **options)
    loss_ax.set_title("Training and Validation Loss")
    loss_ax.set_xlabel("Epochs")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()

    # Right panel: accuracy and F1 score.
    metric_curves = (
        (train_accuracies, "r-", "Training Accuracy"),
        (val_accuracies, "b-", "Validation Accuracy"),
        (val_f1_scores, "g-", "Validation F1 Score"),
    )
    for values, fmt, label in metric_curves:
        metric_ax.plot(epochs, values, fmt, linewidth=2, label=label)
    metric_ax.set_title("Training and Validation Accuracy and F1 Score")
    metric_ax.set_xlabel("Epochs")
    metric_ax.set_ylabel("Metric Value")
    metric_ax.legend()

    plt.tight_layout()
    plt.savefig("metrics_plot.png")  # Write the figure to a file.
    plt.show()
168
+
169
+
170
class TextClassificationDataset(Dataset):
    """Dataset pairing raw texts with labels, tokenized lazily on access."""

    def __init__(self, texts, labels, tokenizer):
        """Store the texts, their labels and the tokenizer used in ``__getitem__``."""
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` tensors for item ``idx``.

        The text is padded/truncated to 512 tokens; singleton dimensions of
        the encoded tensors are squeezed away.
        """
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        token_ids = encoded["input_ids"].squeeze()
        mask = encoded["attention_mask"].squeeze()
        target = torch.tensor(self.labels[idx])
        return token_ids, mask, target
194
+
195
+
196
class ruBERTClassifier(nn.Module):
    """Frozen pretrained encoder followed by a small trainable classification head.

    The head emits one raw score (logit) per input sequence.
    """

    def __init__(self, model_class, pretrained_weights):
        """Load the encoder, freeze it except for its pooler, and build the head.

        Args:
            model_class: class exposing ``from_pretrained``; the loaded model
                must have a ``pooler`` submodule.
            pretrained_weights: checkpoint identifier passed to ``from_pretrained``.
        """
        super().__init__()
        self.bert = model_class.from_pretrained(pretrained_weights)

        # Freeze every encoder parameter...
        for param in self.bert.parameters():
            param.requires_grad = False

        # ...then unfreeze the pooler only.
        # NOTE(review): forward() reads the first output at token 0, not the
        # pooler output, so the pooler presumably receives no gradient — confirm.
        for param in self.bert.pooler.parameters():
            param.requires_grad = True

        # Take the head input width from the encoder config rather than
        # hard-coding it; 312 (the original constant) is the fallback.
        hidden_size = getattr(getattr(self.bert, "config", None), "hidden_size", 312)

        self.linear = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def forward(self, x, attention_mask):
        """Return one score per sequence.

        Args:
            x: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.

        Returns:
            Tensor produced by the head from the first-token vector of the
            encoder's first output.
        """
        first_token = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
        return self.linear(first_token)
model/int_vocab.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c109e13d36a06af12b0a0b65fe09cf5af212a12d95ad715b272d3e0a757ca9c7
3
+ size 13374732
model/model.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Tuple
3
+ import torch
4
+ import torch.nn as nn
5
+
6
HIDDEN_SIZE = 32
VOCAB_SIZE = 196906
EMBEDDING_DIM = 64  # embedding_dim
SEQ_LEN = 100
BATCH_SIZE = 64


class BahdanauAttention(nn.Module):
    """Additive (Bahdanau-style) attention over a sequence of LSTM outputs."""

    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
        """Create the query, key and scoring projections.

        Args:
            hidden_size: size of the LSTM hidden state.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.W_q = nn.Linear(hidden_size, hidden_size)
        self.W_k = nn.Linear(hidden_size, hidden_size)
        self.W_v = nn.Linear(hidden_size, 1)

        self.tanh = nn.Tanh()

    def forward(
        self,
        lstm_outputs: torch.Tensor,  # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        final_hidden: torch.Tensor,  # BATCH_SIZE x HIDDEN_SIZE
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute the attention context and weights.

        Args:
            lstm_outputs (torch.Tensor): LSTM hidden states
                (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE); used as keys.
            final_hidden (torch.Tensor): LSTM final hidden state
                (BATCH_SIZE, HIDDEN_SIZE); used as the query.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                context (BATCH_SIZE, HIDDEN_SIZE) — attention-weighted sum of
                the projected keys;
                attention weights (BATCH_SIZE, SEQ_LEN), summing to 1 over SEQ_LEN.
        """
        keys = self.W_k(lstm_outputs)  # (B, S, H)
        query = self.W_q(final_hidden)  # (B, H)

        # Broadcast the query over the sequence axis and score every step.
        combined = self.tanh(query.unsqueeze(1) + keys)  # (B, S, H)
        scores = self.W_v(combined).squeeze(-1)  # (B, S)
        att_weights = torch.softmax(scores, -1)

        # Squeeze only the singleton "query" axis: a bare squeeze() would also
        # drop the batch axis when the batch holds a single sample.
        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # (B, H)

        return context, att_weights
86
+
87
# Smoke test on random numbers; guarded so that importing this module does not
# run a forward pass as a side effect.
if __name__ == "__main__":
    print(
        BahdanauAttention()(
            torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE),
            torch.randn(BATCH_SIZE, HIDDEN_SIZE),
        )[1].shape
    )
89
+
90
+
91
class LSTMConcatAttentionEmbed(nn.Module):
    """Embedding -> single-layer LSTM -> Bahdanau attention -> classifier head."""

    def __init__(self) -> None:
        """Build the layers from the module-level size constants."""
        super().__init__()

        self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
        self.attn = BahdanauAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 128),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: integer token ids; the LSTM is batch-first, so presumably
                (batch, seq_len) — confirm against the caller.

        Returns:
            ``(out, att_weights)``: the classifier output and the attention
            weights returned by ``self.attn``.
        """
        embedded = self.embedding(x)
        lstm_outputs, (last_hidden, _) = self.lstm(embedded)
        # The LSTM has one layer and one direction, so its final hidden state
        # carries a leading singleton axis that is dropped here.
        query = last_hidden.squeeze(0)
        att_hidden, att_weights = self.attn(lstm_outputs, query)
        return self.clf(att_hidden), att_weights
112
+
113
+
model/model_weights.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de960bfb6327e0509297628c3cec5bc456e6dc681b29aca9bead6330e941d44e
3
+ size 50489371
model/vocab.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e07ef2cc3bbaac41bb510936e8b958c808ffdfcf2e60b39bb7c9d330a6fe67f8
3
+ size 12980920
pages/review_predictor.py CHANGED
@@ -1,8 +1,17 @@
 
1
  import pickle
2
 
 
3
  import streamlit as st
 
 
 
4
 
5
- from preprocessing import data_preprocessing
 
 
 
 
6
 
7
  # Load preprocessing steps
8
  with open("vectorizer.pkl", "rb") as f:
@@ -12,29 +21,63 @@ with open("vectorizer.pkl", "rb") as f:
12
  with open("logreg_model.pkl", "rb") as f:
13
  logreg_predictor = pickle.load(f)
14
 
 
 
15
 
16
- # Define function for preprocessing input text
17
- @st.cache
18
- def preprocess_text(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # Apply preprocessing steps (cleaning, tokenization, vectorization)
20
  clean_text = data_preprocessing(
21
  text
22
  ) # Assuming data_preprocessing is your preprocessing function
23
  print("Clean text ", clean_text)
24
- vectorized_text = vectorizer.transform([" ".join(clean_text)])
25
  return vectorized_text
26
 
27
 
28
  # Define function for making predictions
29
- @st.cache
30
- def predict_sentiment(text):
31
  # Preprocess input text
32
- processed_text = preprocess_text(text)
33
  # Make prediction
34
  prediction = logreg_predictor.predict(processed_text)
35
  return prediction
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  st.sidebar.title("Model Selection")
39
  model_type = st.sidebar.radio("Select Model Type", ["Classic ML", "LSTM", "BERT"])
40
  st.title("Review Prediction")
@@ -44,11 +87,14 @@ st.title("Sentiment Analysis with Logistic Regression")
44
  text_input = st.text_input("Enter your review:")
45
  if st.button("Predict"):
46
  if model_type == "Classic ML":
47
- prediction = predict_sentiment(text_input)
48
  elif model_type == "LSTM":
49
- prediction = 1
 
 
50
  elif model_type == "BERT":
51
- prediction = 1
 
52
 
53
  if prediction == 1:
54
  st.write("prediction")
@@ -56,3 +102,5 @@ if st.button("Predict"):
56
  elif prediction == 0:
57
  st.write("prediction")
58
  st.write("Отзыв отрицательный")
 
 
 
1
+ import json
2
  import pickle
3
 
4
+ import pandas as pd
5
  import streamlit as st
6
+ import torch
7
+ import torch.nn as nn
8
+ import transformers
9
 
10
+ from model.funcs import (create_model_and_tokenizer, load_model,
11
+ predict_sentiment)
12
+ from model.model import LSTMConcatAttentionEmbed
13
+ from preprocessing.preprocessing import data_preprocessing
14
+ from preprocessing.rnn_preprocessing import preprocess_single_string
15
 
16
  # Load preprocessing steps
17
  with open("vectorizer.pkl", "rb") as f:
 
21
  with open("logreg_model.pkl", "rb") as f:
22
  logreg_predictor = pickle.load(f)
23
 
24
+ model_concat_embed = LSTMConcatAttentionEmbed()
25
+ model_concat_embed.load_state_dict(torch.load("model/model_weights.pt"))
26
 
27
+ with open("model/vocab.json", "r") as f:
28
+ vocab_to_int = json.load(f)
29
+
30
+ with open("model/int_vocab.json", "r") as f:
31
+ int_to_vocab = json.load(f)
32
+
33
+ model_class = transformers.AutoModel
34
+ tokenizer_class = transformers.AutoTokenizer
35
+ pretrained_weights = "cointegrated/rubert-tiny2"
36
+ weights_path = "model/best_bert_weights.pth"
37
+ model = load_model(model_class, pretrained_weights, weights_path)
38
+ tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
39
+
40
+
41
def plot_and_predict(review: str, SEQ_LEN: int, model: nn.Module):
    """Classify a review with the LSTM model.

    Args:
        review: raw review text.
        SEQ_LEN: sequence length the encoded review is padded/trimmed to.
        model: module returning ``(prediction, attention_weights)``.

    Returns:
        ``1`` when the sigmoid of the model output exceeds 0.75, otherwise ``0``.
    """
    token_ids = preprocess_single_string(review, SEQ_LEN, vocab_to_int)
    model.eval()
    with torch.inference_mode():
        # A batch axis is added because the model is fed one review at a time.
        logit, _ = model(token_ids.long().unsqueeze(0))
        probability = logit.sigmoid().item()
    if probability > 0.75:
        return 1
    return 0
48
+
49
+
50
def preprocess_text_logreg(text):
    """Clean, lemmatize and vectorize a text for the logistic-regression model.

    Args:
        text: raw input text.

    Returns:
        The result of ``logreg_vectorizer.transform`` on the space-joined tokens.
    """
    # Cleaning and tokenization are delegated to ``data_preprocessing``.
    tokens = data_preprocessing(text)
    print("Clean text ", tokens)
    joined = " ".join(tokens)
    return logreg_vectorizer.transform([joined])
58
 
59
 
60
  # Define function for making predictions
61
def predict_sentiment_logreg(text):
    """Predict the sentiment of a text with the logistic-regression model.

    Args:
        text: raw input text.

    Returns:
        Whatever ``logreg_predictor.predict`` returns for the vectorized text.
    """
    features = preprocess_text_logreg(text)
    return logreg_predictor.predict(features)
67
 
68
 
69
+ metrics = {
70
+ "Models": ["Logistic Regression", "LSTM + attention", "ruBERTtiny2"],
71
+ "f1-macro score": [0.94376, 1, 0.94070],
72
+ }
73
+
74
+
75
+ col1, col2 = st.columns([1, 3])
76
+ df = pd.DataFrame(metrics)
77
+ df.set_index("Models", inplace=True)
78
+ df.index.name = "Model"
79
+
80
+
81
  st.sidebar.title("Model Selection")
82
  model_type = st.sidebar.radio("Select Model Type", ["Classic ML", "LSTM", "BERT"])
83
  st.title("Review Prediction")
 
87
  text_input = st.text_input("Enter your review:")
88
  if st.button("Predict"):
89
  if model_type == "Classic ML":
90
+ prediction = predict_sentiment_logreg(text_input)
91
  elif model_type == "LSTM":
92
+ prediction = plot_and_predict(
93
+ review=text_input, SEQ_LEN=25, model=model_concat_embed
94
+ )
95
  elif model_type == "BERT":
96
+ prediction = predict_sentiment(text_input, model, tokenizer, "cpu")
97
+ st.write(prediction)
98
 
99
  if prediction == 1:
100
  st.write("prediction")
 
102
  elif prediction == 0:
103
  st.write("prediction")
104
  st.write("Отзыв отрицательный")
105
+
106
+ st.write(df)
pages/text_generator.py CHANGED
@@ -2,12 +2,17 @@ import streamlit as st
2
  import torch
3
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
4
 
5
- model_path = "finetuned_model/"
6
- model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
7
- tokenizer = GPT2Tokenizer.from_pretrained(model_name)
8
- model = GPT2LMHeadModel.from_pretrained(model_path)
9
 
 
 
 
 
 
 
 
10
 
 
 
11
  promt = st.text_input("Ask a question")
12
  generate = st.button("Generate")
13
  if generate:
 
2
  import torch
3
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
4
 
 
 
 
 
5
 
6
# ``st.cache_resource`` keeps one shared instance of the objects across reruns
# and sessions. ``st.cache_data`` would serialize the model and hand out a
# fresh copy on every rerun, which is wasteful for large model weights.
@st.cache_resource
def load_model():
    """Load the GPT-2 tokenizer and the fine-tuned model once per process.

    Returns:
        ``(tokenizer, model)``: the tokenizer loaded by name and the model
        loaded from the local ``17/`` directory.
    """
    model_path = "17/"
    model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return tokenizer, model
13
 
14
+
15
+ tokenizer, model = load_model()
16
  promt = st.text_input("Ask a question")
17
  generate = st.button("Generate")
18
  if generate:
preprocessing/__init__.py ADDED
File without changes
preprocessing/preprocessing.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+
4
+ import nltk
5
+ import pymorphy2
6
+ from nltk.tokenize import word_tokenize
7
+
8
+ nltk.download("punkt")
9
+
10
+
11
def clean_text(text: str) -> str:
    """Lowercase a text and strip noisy tokens and ASCII punctuation.

    Removed, in this order: words containing a character repeated three or
    more times in a row, digit runs together with any word characters that
    follow them, ``[...]`` bracketed spans, and ``string.punctuation``.

    Args:
        text: raw input text.

    Returns:
        The cleaned text; surrounding whitespace is left as is.
    """
    removal_patterns = (
        r"\w*(\w)\1{2,}\w*",
        r"\d+\w*",
        r"\[.*?\]",
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    punctuation_table = str.maketrans("", "", string.punctuation)
    return cleaned.translate(punctuation_table)
18
+
19
+
20
# Shared analyzer instance, created lazily on first use.
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared ``pymorphy2.MorphAnalyzer``, creating it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def lemmize_and_tokenize_text(text: str) -> list[str]:
    """Tokenize a text and reduce every token to its normal form.

    The analyzer is built once and reused; constructing a new one on every
    call repeated the same setup for each input.

    Args:
        text: text to tokenize with ``word_tokenize``.

    Returns:
        The normal form of the first parse of each token, in token order.
    """
    morph = _get_morph_analyzer()
    return [morph.parse(token)[0].normal_form for token in word_tokenize(text)]
25
+
26
+
27
def data_preprocessing(text: str) -> list[str]:
    """Clean a text, then tokenize and lemmatize it.

    Args:
        text: raw input text.

    Returns:
        List of lemmas produced from the cleaned text.
    """
    return lemmize_and_tokenize_text(clean_text(text))
preprocessing/rnn_preprocessing.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import numpy as np
4
+ import torch
5
+ import nltk
6
+ nltk.download('stopwords')
7
+ from nltk.corpus import stopwords
8
+ stop_words = set(stopwords.words('english'))
9
+
10
def data_preprocessing(text: str) -> str:
    """Normalize a string: lowercase it and drop HTML tags, punctuation,
    stop words and all-digit words.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: remaining words joined by single spaces
    """
    lowered = text.lower()
    without_tags = re.sub('<.*?>', '', lowered)  # html tags
    # Strip ASCII punctuation character by character.
    without_punct = without_tags.translate(str.maketrans('', '', string.punctuation))
    kept_words = [
        word
        for word in without_punct.split()
        if word not in stop_words and not word.isdigit()
    ]
    return ' '.join(kept_words)
28
+
29
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep the entries whose second element is strictly greater than ``n``.

    Args:
        sorted_words (list): sequence of pairs; the second item of each pair
            is compared against ``n``.
        n (int): exclusive lower bound. Defaults to 10.

    Returns:
        list: the matching entries in their original order.
    """
    return [entry for entry in sorted_words if entry[1] > n]
31
+
32
def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
    """Left-pad (or trim) every token list to exactly ``seq_len`` entries.

    Args:
        review_int (list): list of token lists
        seq_len (int): target length; a longer list keeps only its first
            ``seq_len`` tokens, a shorter one is padded with zeros on the left

    Returns:
        np.array: integer array of shape (len(review_int), seq_len)
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for row, tokens in enumerate(review_int):
        kept = tokens[:seq_len]
        # Rows are already zero-filled, so only the right-aligned tail is
        # written; an empty token list leaves the row untouched.
        if len(kept):
            features[row, seq_len - len(kept):] = kept

    return features
52
+
53
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> torch.tensor:
    """Run every preprocessing step on a single string.

    The string is cleaned with ``data_preprocessing``, each remaining word is
    mapped to its index, words missing from the vocabulary are skipped, and
    the index list is padded/trimmed with ``padding``.

    Args:
        input_string (str): single string to preprocess
        seq_len (int): target sequence length; longer sequences are trimmed,
            shorter ones are padded with zeros
        vocab_to_int (dict): word corpus {'word' : int index}
        verbose (bool): print each word that is missing from the vocabulary.
            Defaults to False.

    Returns:
        torch.tensor: 1-D tensor holding the ``seq_len`` token indices
    """
    token_ids = []
    for word in data_preprocessing(input_string).split():
        try:
            index = vocab_to_int[word]
        except KeyError as e:
            # Out-of-vocabulary words are dropped rather than mapped to a
            # placeholder index.
            if verbose:
                print(f'{e}: not in dictionary!')
            continue
        token_ids.append(index)

    padded_row = padding([token_ids], seq_len)[0]
    return torch.tensor(padded_row)