Spaces:
Sleeping
Sleeping
meghanaraok
commited on
Upload 3 files
Browse files- LongHiLATmain/models/__init__.py +0 -0
- LongHiLATmain/models/modeling.py +350 -0
- LongHiLATmain/models/utils.py +440 -0
LongHiLATmain/models/__init__.py
ADDED
File without changes
|
LongHiLATmain/models/modeling.py
ADDED
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import collections
|
2 |
+
import logging
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from torch.nn import BCEWithLogitsLoss, Dropout, Linear
|
6 |
+
from transformers import AutoModel, XLNetModel, LongformerConfig
|
7 |
+
from transformers.models.longformer.modeling_longformer import LongformerEncoder
|
8 |
+
from huggingface_hub import PyTorchModelHubMixin
|
9 |
+
from LongHiLATmain.hilat.models.utils import initial_code_title_vectors
|
10 |
+
|
11 |
+
logger = logging.getLogger("lwat")
|
12 |
+
|
13 |
+
|
14 |
+
class CodingModelConfig:
|
15 |
+
def __init__(self,
|
16 |
+
transformer_model_name_or_path,
|
17 |
+
transformer_tokenizer_name,
|
18 |
+
transformer_layer_update_strategy,
|
19 |
+
num_chunks,
|
20 |
+
max_seq_length,
|
21 |
+
dropout,
|
22 |
+
dropout_att,
|
23 |
+
d_model,
|
24 |
+
label_dictionary,
|
25 |
+
num_labels,
|
26 |
+
use_code_representation,
|
27 |
+
code_max_seq_length,
|
28 |
+
code_batch_size,
|
29 |
+
multi_head_att,
|
30 |
+
chunk_att,
|
31 |
+
linear_init_mean,
|
32 |
+
linear_init_std,
|
33 |
+
document_pooling_strategy,
|
34 |
+
multi_head_chunk_attention,
|
35 |
+
num_hidden_layers):
|
36 |
+
super(CodingModelConfig, self).__init__()
|
37 |
+
self.transformer_model_name_or_path = transformer_model_name_or_path
|
38 |
+
self.transformer_tokenizer_name = transformer_tokenizer_name
|
39 |
+
self.transformer_layer_update_strategy = transformer_layer_update_strategy
|
40 |
+
self.num_chunks = num_chunks
|
41 |
+
self.max_seq_length = max_seq_length
|
42 |
+
self.dropout = dropout
|
43 |
+
self.dropout_att = dropout_att
|
44 |
+
self.d_model = d_model
|
45 |
+
# labels_dictionary is a dataframe with columns: icd9_code, long_title
|
46 |
+
self.label_dictionary = label_dictionary
|
47 |
+
self.num_labels = num_labels
|
48 |
+
self.use_code_representation = use_code_representation
|
49 |
+
self.code_max_seq_length = code_max_seq_length
|
50 |
+
self.code_batch_size = code_batch_size
|
51 |
+
self.multi_head_att = multi_head_att
|
52 |
+
self.chunk_att = chunk_att
|
53 |
+
self.linear_init_mean = linear_init_mean
|
54 |
+
self.linear_init_std = linear_init_std
|
55 |
+
self.document_pooling_strategy = document_pooling_strategy
|
56 |
+
self.multi_head_chunk_attention = multi_head_chunk_attention
|
57 |
+
self.num_hidden_layers = num_hidden_layers
|
58 |
+
|
59 |
+
|
60 |
+
class LableWiseAttentionLayer(torch.nn.Module):
|
61 |
+
def __init__(self, coding_model_config, args):
|
62 |
+
super(LableWiseAttentionLayer, self).__init__()
|
63 |
+
|
64 |
+
self.config = coding_model_config
|
65 |
+
self.args = args
|
66 |
+
|
67 |
+
# layers
|
68 |
+
self.l1_linear = torch.nn.Linear(self.config.d_model,
|
69 |
+
self.config.d_model, bias=False)
|
70 |
+
self.tanh = torch.nn.Tanh()
|
71 |
+
self.l2_linear = torch.nn.Linear(self.config.d_model, self.config.num_labels, bias=False)
|
72 |
+
self.softmax = torch.nn.Softmax(dim=1)
|
73 |
+
|
74 |
+
# Mean pooling last hidden state of code title from transformer model as the initial code vectors
|
75 |
+
self._init_linear_weights(mean=self.config.linear_init_mean, std=self.config.linear_init_std)
|
76 |
+
|
77 |
+
def _init_linear_weights(self, mean, std):
|
78 |
+
# normalize the l1 weights
|
79 |
+
torch.nn.init.normal_(self.l1_linear.weight, mean, std)
|
80 |
+
if self.l1_linear.bias is not None:
|
81 |
+
self.l1_linear.bias.data.fill_(0)
|
82 |
+
# initialize the l2
|
83 |
+
if self.config.use_code_representation:
|
84 |
+
code_vectors = initial_code_title_vectors(self.config.label_dictionary,
|
85 |
+
self.config.transformer_model_name_or_path,
|
86 |
+
self.config.transformer_tokenizer_name
|
87 |
+
if self.config.transformer_tokenizer_name
|
88 |
+
else self.config.transformer_model_name_or_path,
|
89 |
+
self.config.code_max_seq_length,
|
90 |
+
self.config.code_batch_size,
|
91 |
+
self.config.d_model,
|
92 |
+
self.args.device)
|
93 |
+
|
94 |
+
self.l2_linear.weight = torch.nn.Parameter(code_vectors, requires_grad=True)
|
95 |
+
torch.nn.init.normal_(self.l2_linear.weight, mean, std)
|
96 |
+
if self.l2_linear.bias is not None:
|
97 |
+
self.l2_linear.bias.data.fill_(0)
|
98 |
+
|
99 |
+
def forward(self, x):
|
100 |
+
# input: (batch_size, max_seq_length, transformer_hidden_size)
|
101 |
+
# output: (batch_size, max_seq_length, transformer_hidden_size)
|
102 |
+
# Z = Tan(WH)
|
103 |
+
l1_output = self.tanh(self.l1_linear(x))
|
104 |
+
# softmax(UZ)
|
105 |
+
# l2_linear output shape: (batch_size, max_seq_length, num_labels)
|
106 |
+
# attention_weight shape: (batch_size, num_labels, max_seq_length)
|
107 |
+
attention_weight = self.softmax(self.l2_linear(l1_output)).transpose(1, 2)
|
108 |
+
# attention_output shpae: (batch_size, num_labels, transformer_hidden_size)
|
109 |
+
attention_output = torch.matmul(attention_weight, x)
|
110 |
+
|
111 |
+
return attention_output, attention_weight
|
112 |
+
|
113 |
+
class ChunkAttentionLayer(torch.nn.Module):
|
114 |
+
def __init__(self, coding_model_config, args):
|
115 |
+
super(ChunkAttentionLayer, self).__init__()
|
116 |
+
|
117 |
+
self.config = coding_model_config
|
118 |
+
self.args = args
|
119 |
+
|
120 |
+
# layers
|
121 |
+
self.l1_linear = torch.nn.Linear(self.config.d_model,
|
122 |
+
self.config.d_model, bias=False)
|
123 |
+
self.tanh = torch.nn.Tanh()
|
124 |
+
self.l2_linear = torch.nn.Linear(self.config.d_model, 1, bias=False)
|
125 |
+
self.softmax = torch.nn.Softmax(dim=1)
|
126 |
+
|
127 |
+
self._init_linear_weights(mean=self.config.linear_init_mean, std=self.config.linear_init_std)
|
128 |
+
|
129 |
+
def _init_linear_weights(self, mean, std):
|
130 |
+
# initialize the l1
|
131 |
+
torch.nn.init.normal_(self.l1_linear.weight, mean, std)
|
132 |
+
if self.l1_linear.bias is not None:
|
133 |
+
self.l1_linear.bias.data.fill_(0)
|
134 |
+
# initialize the l2
|
135 |
+
torch.nn.init.normal_(self.l2_linear.weight, mean, std)
|
136 |
+
if self.l2_linear.bias is not None:
|
137 |
+
self.l2_linear.bias.data.fill_(0)
|
138 |
+
|
139 |
+
def forward(self, x):
|
140 |
+
# input: (batch_size, num_chunks, transformer_hidden_size)
|
141 |
+
# output: (batch_size, num_chunks, transformer_hidden_size)
|
142 |
+
# Z = Tan(WH)
|
143 |
+
l1_output = self.tanh(self.l1_linear(x))
|
144 |
+
# softmax(UZ)
|
145 |
+
# l2_linear output shape: (batch_size, num_chunks, 1)
|
146 |
+
# attention_weight shape: (batch_size, 1, num_chunks)
|
147 |
+
attention_weight = self.softmax(self.l2_linear(l1_output)).transpose(1, 2)
|
148 |
+
# attention_output shpae: (batch_size, 1, transformer_hidden_size)
|
149 |
+
attention_output = torch.matmul(attention_weight, x)
|
150 |
+
|
151 |
+
return attention_output, attention_weight
|
152 |
+
|
153 |
+
# define the model class
|
154 |
+
class CodingModel(torch.nn.Module, PyTorchModelHubMixin):
|
155 |
+
def __init__(self, coding_model_config, args, **kwargs):
|
156 |
+
super(CodingModel, self).__init__()
|
157 |
+
self.coding_model_config = coding_model_config
|
158 |
+
self.args = args
|
159 |
+
# layers
|
160 |
+
self.transformer_layer = AutoModel.from_pretrained(self.coding_model_config.transformer_model_name_or_path)
|
161 |
+
if isinstance(self.transformer_layer, XLNetModel):
|
162 |
+
self.transformer_layer.config.use_mems_eval = False
|
163 |
+
self.dropout = Dropout(p=self.coding_model_config.dropout)
|
164 |
+
|
165 |
+
if self.coding_model_config.multi_head_att:
|
166 |
+
# initial multi head attention according to the num_chunks
|
167 |
+
self.label_wise_attention_layer = torch.nn.ModuleList(
|
168 |
+
[LableWiseAttentionLayer(coding_model_config, args)
|
169 |
+
for _ in range(self.coding_model_config.num_chunks)])
|
170 |
+
else:
|
171 |
+
self.label_wise_attention_layer = LableWiseAttentionLayer(coding_model_config, args)
|
172 |
+
self.dropout_att = Dropout(p=self.coding_model_config.dropout_att)
|
173 |
+
|
174 |
+
# initial chunk attention
|
175 |
+
if self.coding_model_config.chunk_att:
|
176 |
+
if self.coding_model_config.multi_head_chunk_attention:
|
177 |
+
self.chunk_attention_layer = torch.nn.ModuleList([ChunkAttentionLayer(coding_model_config, args)
|
178 |
+
for _ in range(self.coding_model_config.num_labels)])
|
179 |
+
else:
|
180 |
+
self.chunk_attention_layer = ChunkAttentionLayer(coding_model_config, args)
|
181 |
+
|
182 |
+
self.classifier_layer = Linear(self.coding_model_config.d_model,
|
183 |
+
self.coding_model_config.num_labels)
|
184 |
+
else:
|
185 |
+
if self.coding_model_config.document_pooling_strategy == "flat":
|
186 |
+
self.classifier_layer = Linear(self.coding_model_config.num_chunks * self.coding_model_config.d_model,
|
187 |
+
self.coding_model_config.num_labels)
|
188 |
+
else: # max or mean pooling
|
189 |
+
self.classifier_layer = Linear(self.coding_model_config.d_model,
|
190 |
+
self.coding_model_config.num_labels)
|
191 |
+
self.sigmoid = torch.nn.Sigmoid()
|
192 |
+
|
193 |
+
if self.coding_model_config.transformer_layer_update_strategy == "no":
|
194 |
+
self.freeze_all_transformer_layers()
|
195 |
+
elif self.coding_model_config.transformer_layer_update_strategy == "last":
|
196 |
+
self.freeze_all_transformer_layers()
|
197 |
+
self.unfreeze_transformer_last_layers()
|
198 |
+
|
199 |
+
# initialize the weights of classifier
|
200 |
+
self._init_linear_weights(mean=self.coding_model_config.linear_init_mean, std=self.coding_model_config.linear_init_std)
|
201 |
+
|
202 |
+
def _init_linear_weights(self, mean, std):
|
203 |
+
torch.nn.init.normal_(self.classifier_layer.weight, mean, std)
|
204 |
+
|
205 |
+
def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
|
206 |
+
# longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn)
|
207 |
+
# (global_attention_mask + 1) => 1 for local attention, 2 for global attention
|
208 |
+
# => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention
|
209 |
+
if attention_mask is not None:
|
210 |
+
attention_mask = attention_mask * (global_attention_mask + 1)
|
211 |
+
else:
|
212 |
+
# simply use `global_attention_mask` as `attention_mask`
|
213 |
+
# if no `attention_mask` is given
|
214 |
+
attention_mask = global_attention_mask + 1
|
215 |
+
return attention_mask
|
216 |
+
|
217 |
+
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, targets=None):
|
218 |
+
# input ids/mask/type_ids shape: (batch_size, num_chunks, max_seq_length)
|
219 |
+
# labels shape: (batch_size, num_labels)
|
220 |
+
transformer_output = []
|
221 |
+
|
222 |
+
# pass chunk by chunk into transformer layer in the batches.
|
223 |
+
# input (batch_size, sequence_length)
|
224 |
+
for i in range(self.coding_model_config.num_chunks):
|
225 |
+
l1_output = self.transformer_layer(input_ids=input_ids[:, i, :],
|
226 |
+
attention_mask=attention_mask[:, i, :],
|
227 |
+
token_type_ids=token_type_ids[:, i, :])
|
228 |
+
# output hidden state shape: (batch_size, sequence_length, hidden_size)
|
229 |
+
transformer_output.append(l1_output[0])
|
230 |
+
|
231 |
+
# transpose back chunk and batch size dimensions
|
232 |
+
transformer_output = torch.stack(transformer_output)
|
233 |
+
transformer_output = transformer_output.transpose(0, 1)
|
234 |
+
# dropout transformer output
|
235 |
+
l2_dropout = self.dropout(transformer_output)
|
236 |
+
|
237 |
+
config = LongformerConfig.from_pretrained("allenai/longformer-base-4096")
|
238 |
+
config.num_labels =5
|
239 |
+
config.num_hidden_layers = 2
|
240 |
+
# self.coding_model_config.num_hidden_layers
|
241 |
+
config.hidden_size = self.coding_model_config.d_model
|
242 |
+
config.attention_window = [512, 512]
|
243 |
+
longformer_layer = LongformerEncoder(config)
|
244 |
+
# longformer_layer = longformer_layer(config)
|
245 |
+
longformer_layer = longformer_layer.to(torch.device("cuda:0"))
|
246 |
+
l2_dropout= l2_dropout.reshape(l2_dropout.shape[0], l2_dropout.shape[1]*l2_dropout.shape[2], l2_dropout.shape[3])
|
247 |
+
attention_mask = attention_mask.reshape(attention_mask.shape[0], attention_mask.shape[1]*attention_mask.shape[2])
|
248 |
+
# is_index_masked = attention_mask < 0
|
249 |
+
|
250 |
+
global_attention_mask = torch.zeros_like(attention_mask)
|
251 |
+
# global attention on cls token
|
252 |
+
global_attention_positions = [0, 512, 1024, 1536, 2048, 2560, 3072, 3584, 4095]
|
253 |
+
global_attention_mask[:, global_attention_positions] = 1
|
254 |
+
|
255 |
+
if global_attention_mask is not None:
|
256 |
+
attention_mask = self._merge_to_attention_mask(attention_mask, global_attention_mask)
|
257 |
+
output = longformer_layer(l2_dropout, attention_mask=attention_mask,output_attentions=True)
|
258 |
+
l2_dropout = self.dropout_att(output[0])
|
259 |
+
l2_dropout = l2_dropout.reshape(l2_dropout.shape[0], self.coding_model_config.num_chunks, self.coding_model_config.max_seq_length, self.coding_model_config.d_model)
|
260 |
+
|
261 |
+
# Label-wise attention layers
|
262 |
+
# output: (batch_size, num_chunks, num_labels, hidden_size)
|
263 |
+
attention_output = []
|
264 |
+
attention_weights = []
|
265 |
+
|
266 |
+
for i in range(self.coding_model_config.num_chunks):
|
267 |
+
# input: (batch_size, max_seq_length, transformer_hidden_size)
|
268 |
+
if self.coding_model_config.multi_head_att:
|
269 |
+
attention_layer = self.label_wise_attention_layer[i]
|
270 |
+
else:
|
271 |
+
attention_layer = self.label_wise_attention_layer
|
272 |
+
l3_attention, attention_weight = attention_layer(l2_dropout[:, i, :])
|
273 |
+
# l3_attention shape: (batch_size, num_labels, hidden_size)
|
274 |
+
# attention_weight: (batch_size, num_labels, max_seq_length)
|
275 |
+
attention_output.append(l3_attention)
|
276 |
+
attention_weights.append(attention_weight)
|
277 |
+
|
278 |
+
attention_output = torch.stack(attention_output)
|
279 |
+
attention_output = attention_output.transpose(0, 1)
|
280 |
+
attention_weights = torch.stack(attention_weights)
|
281 |
+
attention_weights = attention_weights.transpose(0, 1)
|
282 |
+
|
283 |
+
l3_dropout = self.dropout_att(attention_output)
|
284 |
+
|
285 |
+
if self.coding_model_config.chunk_att:
|
286 |
+
# Chunk attention layers
|
287 |
+
# output: (batch_size, num_labels, hidden_size)
|
288 |
+
chunk_attention_output = []
|
289 |
+
chunk_attention_weights = []
|
290 |
+
|
291 |
+
for i in range(self.coding_model_config.num_labels):
|
292 |
+
if self.coding_model_config.multi_head_chunk_attention:
|
293 |
+
chunk_attention = self.chunk_attention_layer[i]
|
294 |
+
else:
|
295 |
+
chunk_attention = self.chunk_attention_layer
|
296 |
+
l4_chunk_attention, l4_chunk_attention_weights = chunk_attention(l3_dropout[:, :, i])
|
297 |
+
chunk_attention_output.append(l4_chunk_attention.squeeze(dim=1))
|
298 |
+
chunk_attention_weights.append(l4_chunk_attention_weights.squeeze(dim=1))
|
299 |
+
|
300 |
+
chunk_attention_output = torch.stack(chunk_attention_output)
|
301 |
+
chunk_attention_output = chunk_attention_output.transpose(0, 1)
|
302 |
+
chunk_attention_weights = torch.stack(chunk_attention_weights)
|
303 |
+
chunk_attention_weights = chunk_attention_weights.transpose(0, 1)
|
304 |
+
# output shape: (batch_size, num_labels, hidden_size)
|
305 |
+
l4_dropout = self.dropout_att(chunk_attention_output)
|
306 |
+
else:
|
307 |
+
# output shape: (batch_size, num_labels, hidden_size*num_chunks)
|
308 |
+
l4_dropout = l3_dropout.transpose(1, 2)
|
309 |
+
if self.coding_model_config.document_pooling_strategy == "flat":
|
310 |
+
# Flatten layer. concatenate representation by labels
|
311 |
+
l4_dropout = torch.flatten(l4_dropout, start_dim=2)
|
312 |
+
elif self.coding_model_config.document_pooling_strategy == "max":
|
313 |
+
l4_dropout = torch.amax(l4_dropout, 2)
|
314 |
+
elif self.coding_model_config.document_pooling_strategy == "mean":
|
315 |
+
l4_dropout = torch.mean(l4_dropout, 2)
|
316 |
+
else:
|
317 |
+
raise ValueError("Not supported pooling strategy")
|
318 |
+
|
319 |
+
# classifier layer
|
320 |
+
# each code has a binary linear formula
|
321 |
+
logits = self.classifier_layer.weight.mul(l4_dropout).sum(dim=2).add(self.classifier_layer.bias)
|
322 |
+
|
323 |
+
loss_fct = BCEWithLogitsLoss()
|
324 |
+
loss = loss_fct(logits, targets)
|
325 |
+
|
326 |
+
return {
|
327 |
+
"loss": loss,
|
328 |
+
"logits": logits,
|
329 |
+
"label_attention_weights": attention_weights,
|
330 |
+
"chunk_attention_weights": chunk_attention_weights if self.coding_model_config.chunk_att else []
|
331 |
+
}
|
332 |
+
|
333 |
+
def freeze_all_transformer_layers(self):
|
334 |
+
"""
|
335 |
+
Freeze all layer weight parameters. They will not be updated during training.
|
336 |
+
"""
|
337 |
+
for param in self.transformer_layer.parameters():
|
338 |
+
param.requires_grad = False
|
339 |
+
|
340 |
+
def unfreeze_all_transformer_layers(self):
|
341 |
+
"""
|
342 |
+
Unfreeze all layers weight parameters. They will be updated during training.
|
343 |
+
"""
|
344 |
+
for param in self.transformer_layer.parameters():
|
345 |
+
param.requires_grad = True
|
346 |
+
|
347 |
+
def unfreeze_transformer_last_layers(self):
|
348 |
+
for name, param in self.transformer_layer.named_parameters():
|
349 |
+
if "layer.11" in name or "pooler" in name:
|
350 |
+
param.requires_grad = True
|
LongHiLATmain/models/utils.py
ADDED
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import linecache
|
3 |
+
import pickle
|
4 |
+
import random
|
5 |
+
import subprocess
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import redis
|
9 |
+
import torch
|
10 |
+
import logging
|
11 |
+
import ast
|
12 |
+
|
13 |
+
from datasets import Dataset
|
14 |
+
from tqdm import tqdm
|
15 |
+
|
16 |
+
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve, auc
|
17 |
+
from torch.utils.data import DataLoader
|
18 |
+
from transformers import AutoModel, DataCollatorWithPadding, XLNetTokenizer, XLNetTokenizerFast, AutoTokenizer, \
|
19 |
+
XLNetModel, is_torch_tpu_available
|
20 |
+
|
21 |
+
logger = logging.getLogger("lwat")
|
22 |
+
|
23 |
+
|
24 |
+
class MimicIIIDataset(Dataset):
|
25 |
+
def __init__(self, data):
|
26 |
+
self.input_ids = data["input_ids"]
|
27 |
+
self.attention_mask = data["attention_mask"]
|
28 |
+
self.token_type_ids = data["token_type_ids"]
|
29 |
+
self.labels = data["targets"]
|
30 |
+
|
31 |
+
def __len__(self):
|
32 |
+
return len(self.input_ids)
|
33 |
+
|
34 |
+
def __getitem__(self, item):
|
35 |
+
return {
|
36 |
+
"input_ids": torch.tensor(self.input_ids[item], dtype=torch.long),
|
37 |
+
"attention_mask": torch.tensor(self.attention_mask[item], dtype=torch.float),
|
38 |
+
"token_type_ids": torch.tensor(self.token_type_ids[item], dtype=torch.long),
|
39 |
+
"targets": torch.tensor(self.labels[item], dtype=torch.float)
|
40 |
+
}
|
41 |
+
|
42 |
+
class LazyMimicIIIDataset(Dataset):
|
43 |
+
def __init__(self, filename, task, dataset_type):
|
44 |
+
print("lazy load from {}".format(filename))
|
45 |
+
self.filename = filename
|
46 |
+
self.redis = redis.Redis(unix_socket_path="/tmp/redis.sock")
|
47 |
+
self.pipe = self.redis.pipeline()
|
48 |
+
self.num_examples = 0
|
49 |
+
self.task = task
|
50 |
+
self.dataset_type = dataset_type
|
51 |
+
with open(filename, 'r') as f:
|
52 |
+
for line_num, line in enumerate(f.readlines()):
|
53 |
+
self.num_examples += 1
|
54 |
+
example = eval(line)
|
55 |
+
key = task + '_' + dataset_type + '_' + str(line_num)
|
56 |
+
input_ids = eval(example[0])
|
57 |
+
attention_mask = eval(example[1])
|
58 |
+
token_type_ids = eval(example[2])
|
59 |
+
labels = eval(example[3])
|
60 |
+
example_tuple = (input_ids, attention_mask, token_type_ids, labels)
|
61 |
+
|
62 |
+
self.pipe.set(key, pickle.dumps(example_tuple))
|
63 |
+
if line_num % 100 == 0:
|
64 |
+
self.pipe.execute()
|
65 |
+
self.pipe.execute()
|
66 |
+
if is_torch_tpu_available():
|
67 |
+
import torch_xla.core.xla_model as xm
|
68 |
+
xm.rendezvous(tag="featuresGenerated")
|
69 |
+
|
70 |
+
def __len__(self):
|
71 |
+
return self.num_examples
|
72 |
+
|
73 |
+
def __getitem__(self, item):
|
74 |
+
key = self.task + '_' + self.dataset_type + '_' + str(item)
|
75 |
+
example = pickle.loads(self.redis.get(key))
|
76 |
+
|
77 |
+
return {
|
78 |
+
"input_ids": torch.tensor(example[0], dtype=torch.long),
|
79 |
+
"attention_mask": torch.tensor(example[1], dtype=torch.float),
|
80 |
+
"token_type_ids": torch.tensor(example[2], dtype=torch.long),
|
81 |
+
"targets": torch.tensor(example[3], dtype=torch.float)
|
82 |
+
}
|
83 |
+
|
84 |
+
|
85 |
+
class ICDCodeDataset(Dataset):
|
86 |
+
def __init__(self, data):
|
87 |
+
self.input_ids = data["input_ids"]
|
88 |
+
self.attention_mask = data["attention_mask"]
|
89 |
+
self.token_type_ids = data["token_type_ids"]
|
90 |
+
|
91 |
+
def __len__(self):
|
92 |
+
return len(self.input_ids)
|
93 |
+
|
94 |
+
def __getitem__(self, item):
|
95 |
+
return {
|
96 |
+
"input_ids": torch.tensor(self.input_ids[item], dtype=torch.long),
|
97 |
+
"attention_mask": torch.tensor(self.attention_mask[item], dtype=torch.float),
|
98 |
+
"token_type_ids": torch.tensor(self.token_type_ids[item], dtype=torch.long)
|
99 |
+
}
|
100 |
+
|
101 |
+
|
102 |
+
def set_random_seed(random_seed):
|
103 |
+
random.seed(random_seed)
|
104 |
+
np.random.seed(random_seed)
|
105 |
+
torch.manual_seed(random_seed)
|
106 |
+
torch.cuda.manual_seed_all(random_seed)
|
107 |
+
torch.backends.cudnn.deterministic = True
|
108 |
+
torch.backends.cudnn.benchmark = False
|
109 |
+
|
110 |
+
def tokenize_inputs(text_list, tokenizer, max_seq_len=512):
|
111 |
+
"""
|
112 |
+
Tokenizes the input text input into ids. Appends the appropriate special
|
113 |
+
characters to the end of the text to denote end of sentence. Truncate or pad
|
114 |
+
the appropriate sequence length.
|
115 |
+
"""
|
116 |
+
# tokenize the text, then truncate sequence to the desired length minus 2 for
|
117 |
+
# the 2 special characters
|
118 |
+
tokenized_texts = list(map(lambda t: tokenizer.tokenize(t)[:max_seq_len - 2], text_list))
|
119 |
+
# convert tokenized text into numeric ids for the appropriate LM
|
120 |
+
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
|
121 |
+
# get token type for token_ids_0
|
122 |
+
token_type_ids = [tokenizer.create_token_type_ids_from_sequences(x) for x in input_ids]
|
123 |
+
# append special token to end of sentence: <sep> <cls>
|
124 |
+
input_ids = [tokenizer.build_inputs_with_special_tokens(x) for x in input_ids]
|
125 |
+
# attention mask
|
126 |
+
attention_mask = [[1] * len(x) for x in input_ids]
|
127 |
+
|
128 |
+
# padding to max_length
|
129 |
+
def padding_to_max(sequence, value):
|
130 |
+
padding_len = max_seq_len - len(sequence)
|
131 |
+
padding = [value] * padding_len
|
132 |
+
return sequence + padding
|
133 |
+
|
134 |
+
input_ids = [padding_to_max(x, tokenizer.pad_token_id) for x in input_ids]
|
135 |
+
attention_mask = [padding_to_max(x, 0) for x in attention_mask]
|
136 |
+
token_type_ids = [padding_to_max(x, tokenizer.pad_token_type_id) for x in token_type_ids]
|
137 |
+
|
138 |
+
return input_ids, attention_mask, token_type_ids
|
139 |
+
|
140 |
+
|
141 |
+
def tokenize_dataset(tokenizer, text, labels, max_seq_len):
|
142 |
+
if (isinstance(tokenizer, XLNetTokenizer) or isinstance(tokenizer, XLNetTokenizerFast)):
|
143 |
+
data = list(map(lambda t: tokenize_inputs(t, tokenizer, max_seq_len=max_seq_len), text))
|
144 |
+
input_ids, attention_mask, token_type_ids = zip(*data)
|
145 |
+
else:
|
146 |
+
tokenizer.model_max_length = max_seq_len
|
147 |
+
input_dict = tokenizer(text, padding=True, truncation=True)
|
148 |
+
input_ids = input_dict["input_ids"]
|
149 |
+
attention_mask = input_dict["attention_mask"]
|
150 |
+
token_type_ids = input_dict["token_type_ids"]
|
151 |
+
|
152 |
+
return {
|
153 |
+
"input_ids": input_ids,
|
154 |
+
"attention_mask": attention_mask,
|
155 |
+
"token_type_ids": token_type_ids,
|
156 |
+
"targets": labels
|
157 |
+
}
|
158 |
+
|
159 |
+
|
160 |
+
def initial_code_title_vectors(label_dict, transformer_model_name, tokenizer_name, code_max_seq_length, code_batch_size,
|
161 |
+
d_model, device):
|
162 |
+
logger.info("Generate code title representations from base transformer model")
|
163 |
+
model = AutoModel.from_pretrained(transformer_model_name)
|
164 |
+
if isinstance(model, XLNetModel):
|
165 |
+
model.config.use_mems_eval = False
|
166 |
+
#
|
167 |
+
# model.config.use_mems_eval = False
|
168 |
+
# model.config.reuse_len = 0
|
169 |
+
code_titles = label_dict["long_title"].fillna("").tolist()
|
170 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, padding_side="right")
|
171 |
+
data = tokenizer(code_titles, padding=True, truncation=True)
|
172 |
+
code_dataset = ICDCodeDataset(data)
|
173 |
+
|
174 |
+
model.to(device)
|
175 |
+
|
176 |
+
data_collator = DataCollatorWithPadding(tokenizer, padding="max_length",
|
177 |
+
max_length=code_max_seq_length)
|
178 |
+
code_param = {"batch_size": code_batch_size, "collate_fn": data_collator}
|
179 |
+
code_dataloader = DataLoader(code_dataset, **code_param)
|
180 |
+
|
181 |
+
code_dataloader_progress_bar = tqdm(code_dataloader, unit="batches",
|
182 |
+
desc="Code title representations")
|
183 |
+
code_dataloader_progress_bar.clear()
|
184 |
+
|
185 |
+
# output shape: (num_labels, hidden_size)
|
186 |
+
initial_code_vectors = torch.zeros(len(code_dataset), d_model)
|
187 |
+
|
188 |
+
for i, data in enumerate(code_dataloader_progress_bar):
|
189 |
+
input_ids = data["input_ids"].to(device, dtype=torch.long)
|
190 |
+
attention_mask = data["attention_mask"].to(device, dtype=torch.float)
|
191 |
+
token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
|
192 |
+
|
193 |
+
# output shape: (batch_size, sequence_length, hidden_size)
|
194 |
+
output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
|
195 |
+
# Mean pooling. output shape: (batch_size, hidden_size)
|
196 |
+
mean_last_hidden_state = torch.mean(output[0], 1)
|
197 |
+
# Max pooling. output shape: (batch_size, hidden_size)
|
198 |
+
# max_last_hidden_state = torch.max((output[0] * attention_mask.unsqueeze(-1)), 1)[0]
|
199 |
+
|
200 |
+
initial_code_vectors[i * input_ids.shape[0]:(i + 1) * input_ids.shape[0], :] = mean_last_hidden_state
|
201 |
+
|
202 |
+
code_dataloader_progress_bar.refresh(True)
|
203 |
+
code_dataloader_progress_bar.clear(True)
|
204 |
+
code_dataloader_progress_bar.close()
|
205 |
+
logger.info("Code representations ready for use. Shape {}".format(initial_code_vectors.shape))
|
206 |
+
return initial_code_vectors
|
207 |
+
|
208 |
+
|
209 |
+
def normalise_labels(labels, n_label):
|
210 |
+
norm_labels = []
|
211 |
+
for label in labels:
|
212 |
+
one_hot_vector_label = [0] * n_label
|
213 |
+
one_hot_vector_label[label] = 1
|
214 |
+
norm_labels.append(one_hot_vector_label)
|
215 |
+
return np.asarray(norm_labels)
|
216 |
+
|
217 |
+
|
218 |
+
def segment_tokenize_inputs(text, tokenizer, max_seq_len, num_chunks):
|
219 |
+
# input is full text of one document
|
220 |
+
tokenized_texts = []
|
221 |
+
tokens = tokenizer.tokenize(text)
|
222 |
+
start_idx = 0
|
223 |
+
seq_len = max_seq_len - 2
|
224 |
+
for i in range(num_chunks):
|
225 |
+
if start_idx > len(tokens):
|
226 |
+
tokenized_texts.append([])
|
227 |
+
continue
|
228 |
+
tokenized_texts.append(tokens[start_idx:(start_idx + seq_len)])
|
229 |
+
start_idx += seq_len
|
230 |
+
|
231 |
+
# convert tokenized text into numeric ids for the appropriate LM
|
232 |
+
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
|
233 |
+
# get token type for token_ids_0
|
234 |
+
token_type_ids = [tokenizer.create_token_type_ids_from_sequences(x) for x in input_ids]
|
235 |
+
# append special token to end of sentence: <sep> <cls>
|
236 |
+
input_ids = [tokenizer.build_inputs_with_special_tokens(x) for x in input_ids]
|
237 |
+
# attention mask
|
238 |
+
attention_mask = [[1] * len(x) for x in input_ids]
|
239 |
+
|
240 |
+
# padding to max_length
|
241 |
+
def padding_to_max(sequence, value):
|
242 |
+
padding_len = max_seq_len - len(sequence)
|
243 |
+
padding = [value] * padding_len
|
244 |
+
return sequence + padding
|
245 |
+
|
246 |
+
input_ids = [padding_to_max(x, tokenizer.pad_token_id) for x in input_ids]
|
247 |
+
attention_mask = [padding_to_max(x, 0) for x in attention_mask]
|
248 |
+
token_type_ids = [padding_to_max(x, tokenizer.pad_token_type_id) for x in token_type_ids]
|
249 |
+
|
250 |
+
return input_ids, attention_mask, token_type_ids
|
251 |
+
|
252 |
+
|
253 |
+
def segment_tokenize_dataset(tokenizer, text, labels, max_seq_len, num_chunks):
|
254 |
+
data = list(
|
255 |
+
map(lambda t: segment_tokenize_inputs(t, tokenizer, max_seq_len, num_chunks), text))
|
256 |
+
input_ids, attention_mask, token_type_ids = zip(*data)
|
257 |
+
|
258 |
+
return {
|
259 |
+
"input_ids": input_ids,
|
260 |
+
"attention_mask": attention_mask,
|
261 |
+
"token_type_ids": token_type_ids,
|
262 |
+
"targets": labels
|
263 |
+
}
|
264 |
+
|
265 |
+
|
266 |
+
# The following functions are modified from the relevant codes of https://github.com/aehrc/LAAT
|
267 |
+
def roc_auc(true_labels, pred_probs, average="macro"):
|
268 |
+
if pred_probs.shape[0] <= 1:
|
269 |
+
return
|
270 |
+
|
271 |
+
fpr = {}
|
272 |
+
tpr = {}
|
273 |
+
if average == "macro":
|
274 |
+
# get AUC for each label individually
|
275 |
+
relevant_labels = []
|
276 |
+
auc_labels = {}
|
277 |
+
for i in range(true_labels.shape[1]):
|
278 |
+
# only if there are true positives for this label
|
279 |
+
if true_labels[:, i].sum() > 0:
|
280 |
+
fpr[i], tpr[i], _ = roc_curve(true_labels[:, i], pred_probs[:, i])
|
281 |
+
if len(fpr[i]) > 1 and len(tpr[i]) > 1:
|
282 |
+
auc_score = auc(fpr[i], tpr[i])
|
283 |
+
if not np.isnan(auc_score):
|
284 |
+
auc_labels["auc_%d" % i] = auc_score
|
285 |
+
relevant_labels.append(i)
|
286 |
+
|
287 |
+
# macro-AUC: just average the auc scores
|
288 |
+
aucs = []
|
289 |
+
for i in relevant_labels:
|
290 |
+
aucs.append(auc_labels['auc_%d' % i])
|
291 |
+
score = np.mean(aucs)
|
292 |
+
else:
|
293 |
+
# micro-AUC: just look at each individual prediction
|
294 |
+
flat_pred = pred_probs.ravel()
|
295 |
+
fpr["micro"], tpr["micro"], _ = roc_curve(true_labels.ravel(), flat_pred)
|
296 |
+
score = auc(fpr["micro"], tpr["micro"])
|
297 |
+
|
298 |
+
return score
|
299 |
+
|
300 |
+
|
301 |
+
def union_size(x, y, axis):
|
302 |
+
return np.logical_or(x, y).sum(axis=axis).astype(float)
|
303 |
+
|
304 |
+
|
305 |
+
def intersect_size(x, y, axis):
|
306 |
+
return np.logical_and(x, y).sum(axis=axis).astype(float)
|
307 |
+
|
308 |
+
|
309 |
+
def macro_accuracy(true_labels, pred_labels):
|
310 |
+
num = intersect_size(true_labels, pred_labels, 0) / (union_size(true_labels, pred_labels, 0) + 1e-10)
|
311 |
+
return np.mean(num)
|
312 |
+
|
313 |
+
|
314 |
+
def macro_precision(true_labels, pred_labels):
|
315 |
+
num = intersect_size(true_labels, pred_labels, 0) / (pred_labels.sum(axis=0) + 1e-10)
|
316 |
+
return np.mean(num)
|
317 |
+
|
318 |
+
|
319 |
+
def macro_recall(true_labels, pred_labels):
|
320 |
+
num = intersect_size(true_labels, pred_labels, 0) / (true_labels.sum(axis=0) + 1e-10)
|
321 |
+
return np.mean(num)
|
322 |
+
|
323 |
+
|
324 |
+
def macro_f1(true_labels, pred_labels):
|
325 |
+
prec = macro_precision(true_labels, pred_labels)
|
326 |
+
rec = macro_recall(true_labels, pred_labels)
|
327 |
+
if prec + rec == 0:
|
328 |
+
f1 = 0.
|
329 |
+
else:
|
330 |
+
f1 = 2 * (prec * rec) / (prec + rec)
|
331 |
+
return prec, rec, f1
|
332 |
+
|
333 |
+
|
334 |
+
def precision_at_k(true_labels, pred_probs, ks=[1, 5, 8, 10, 15]):
|
335 |
+
# num true labels in top k predictions / k
|
336 |
+
sorted_pred = np.argsort(pred_probs)[:, ::-1]
|
337 |
+
output = []
|
338 |
+
for k in ks:
|
339 |
+
topk = sorted_pred[:, :k]
|
340 |
+
|
341 |
+
# get precision at k for each example
|
342 |
+
vals = []
|
343 |
+
for i, tk in enumerate(topk):
|
344 |
+
if len(tk) > 0:
|
345 |
+
num_true_in_top_k = true_labels[i, tk].sum()
|
346 |
+
denom = len(tk)
|
347 |
+
vals.append(num_true_in_top_k / float(denom))
|
348 |
+
|
349 |
+
output.append(np.mean(vals))
|
350 |
+
return output
|
351 |
+
|
352 |
+
|
353 |
+
def micro_recall(true_labels, pred_labels):
|
354 |
+
flat_true = true_labels.ravel()
|
355 |
+
flat_pred = pred_labels.ravel()
|
356 |
+
return intersect_size(flat_true, flat_pred, 0) / flat_true.sum(axis=0)
|
357 |
+
|
358 |
+
|
359 |
+
def micro_precision(true_labels, pred_labels):
|
360 |
+
flat_true = true_labels.ravel()
|
361 |
+
flat_pred = pred_labels.ravel()
|
362 |
+
if flat_pred.sum(axis=0) == 0:
|
363 |
+
return 0.0
|
364 |
+
return intersect_size(flat_true, flat_pred, 0) / flat_pred.sum(axis=0)
|
365 |
+
|
366 |
+
|
367 |
+
def micro_f1(true_labels, pred_labels):
|
368 |
+
prec = micro_precision(true_labels, pred_labels)
|
369 |
+
rec = micro_recall(true_labels, pred_labels)
|
370 |
+
if prec + rec == 0:
|
371 |
+
f1 = 0.
|
372 |
+
else:
|
373 |
+
f1 = 2 * (prec * rec) / (prec + rec)
|
374 |
+
return prec, rec, f1
|
375 |
+
|
376 |
+
|
377 |
+
def micro_accuracy(true_labels, pred_labels):
|
378 |
+
flat_true = true_labels.ravel()
|
379 |
+
flat_pred = pred_labels.ravel()
|
380 |
+
return intersect_size(flat_true, flat_pred, 0) / union_size(flat_true, flat_pred, 0)
|
381 |
+
|
382 |
+
|
383 |
+
def calculate_scores(true_labels, logits, average="macro", is_multilabel=True, threshold=0.5):
|
384 |
+
def sigmoid(x):
|
385 |
+
return 1 / (1 + np.exp(-x))
|
386 |
+
|
387 |
+
pred_probs = sigmoid(logits)
|
388 |
+
pred_labels = np.rint(pred_probs - threshold + 0.5)
|
389 |
+
|
390 |
+
max_size = min(len(true_labels), len(pred_labels))
|
391 |
+
true_labels = true_labels[: max_size]
|
392 |
+
pred_labels = pred_labels[: max_size]
|
393 |
+
pred_probs = pred_probs[: max_size]
|
394 |
+
p_1 = 0
|
395 |
+
p_5 = 0
|
396 |
+
p_8 = 0
|
397 |
+
p_10 = 0
|
398 |
+
p_15 = 0
|
399 |
+
if pred_probs is not None:
|
400 |
+
if not is_multilabel:
|
401 |
+
normalised_labels = normalise_labels(true_labels, len(pred_probs[0]))
|
402 |
+
auc_score = roc_auc(normalised_labels, pred_probs, average=average)
|
403 |
+
accuracy = accuracy_score(true_labels, pred_labels)
|
404 |
+
precision = precision_score(true_labels, pred_labels, average=average)
|
405 |
+
recall = recall_score(true_labels, pred_labels, average=average)
|
406 |
+
f1 = f1_score(true_labels, pred_labels, average=average)
|
407 |
+
else:
|
408 |
+
if average == "macro":
|
409 |
+
accuracy = macro_accuracy(true_labels, pred_labels) # categorical accuracy
|
410 |
+
precision, recall, f1 = macro_f1(true_labels, pred_labels)
|
411 |
+
p_ks = precision_at_k(true_labels, pred_probs, [1, 5, 8, 10, 15])
|
412 |
+
p_1 = p_ks[0]
|
413 |
+
p_5 = p_ks[1]
|
414 |
+
p_8 = p_ks[2]
|
415 |
+
p_10 = p_ks[3]
|
416 |
+
p_15 = p_ks[4]
|
417 |
+
|
418 |
+
else:
|
419 |
+
accuracy = micro_accuracy(true_labels, pred_labels)
|
420 |
+
precision, recall, f1 = micro_f1(true_labels, pred_labels)
|
421 |
+
auc_score = roc_auc(true_labels, pred_probs, average)
|
422 |
+
|
423 |
+
# Calculate label-wise F1 scores
|
424 |
+
labelwise_f1 = f1_score(true_labels, pred_labels, average=None)
|
425 |
+
labelwise_f1 = np.array2string(labelwise_f1, separator=',')
|
426 |
+
|
427 |
+
|
428 |
+
else:
|
429 |
+
auc_score = -1
|
430 |
+
|
431 |
+
output = {"{}_precision".format(average): precision, "{}_recall".format(average): recall,
|
432 |
+
"{}_f1".format(average): f1, "{}_accuracy".format(average): accuracy,
|
433 |
+
"{}_auc".format(average): auc_score, "{}_P@1".format(average): p_1, "{}_P@5".format(average): p_5,
|
434 |
+
"{}_P@8".format(average): p_8, "{}_P@10".format(average): p_10, "{}_P@15".format(average): p_15,
|
435 |
+
"labelwise_f1": labelwise_f1
|
436 |
+
}
|
437 |
+
|
438 |
+
return output
|
439 |
+
|
440 |
+
|