Update app.py
app.py CHANGED
@@ -32,6 +32,142 @@ else:
 device = torch.device("cpu")
 print('No GPU available, using the CPU instead.')
 
+
+class BERTSentenceTransform:
+    r"""BERT style data transformation.
+
+    Parameters
+    ----------
+    tokenizer : BERTTokenizer.
+        Tokenizer for the sentences.
+    max_seq_length : int.
+        Maximum sequence length of the sentences.
+    pad : bool, default True
+        Whether to pad the sentences to maximum length.
+    pair : bool, default True
+        Whether to transform sentences or sentence pairs.
+    """
+
+    # Store the tokenizer, maximum sequence length, vocab, and pad/pair settings passed in
+    def __init__(self, tokenizer, max_seq_length, vocab, pad=True, pair=True):
+        self._tokenizer = tokenizer
+        self._max_seq_length = max_seq_length
+        self._pad = pad
+        self._pair = pair
+        self._vocab = vocab
+
+    # Convert an input sentence or sentence pair into the format the BERT model expects
+    def __call__(self, line):
+        """Perform transformation for sequence pairs or single sequences.
+
+        The transformation is processed in the following steps:
+        - tokenize the input sequences
+        - insert [CLS], [SEP] as necessary
+        - generate type ids to indicate whether a token belongs to the first
+          sequence or the second sequence.
+        - generate valid length
+
+        For sequence pairs, the input is a tuple of 2 strings:
+        text_a, text_b.
+
+        Inputs:
+            text_a: 'is this jacksonville ?'
+            text_b: 'no it is not'
+        Tokenization:
+            text_a: 'is this jack ##son ##ville ?'
+            text_b: 'no it is not .'
+        Processed:
+            tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
+            type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+            valid_length: 14
+
+        For single sequences, the input is a tuple of a single string:
+        text_a.
+
+        Inputs:
+            text_a: 'the dog is hairy .'
+        Tokenization:
+            text_a: 'the dog is hairy .'
+        Processed:
+            text_a: '[CLS] the dog is hairy . [SEP]'
+            type_ids: 0 0 0 0 0 0 0
+            valid_length: 7
+
+        Parameters
+        ----------
+        line: tuple of str
+            Input strings. For sequence pairs, the input is a tuple of 2 strings:
+            (text_a, text_b). For single sequences, the input is a tuple of a single
+            string: (text_a,).
+
+        Returns
+        -------
+        np.array: input token ids in 'int32', shape (batch_size, seq_length)
+        np.array: valid length in 'int32', shape (batch_size,)
+        np.array: input token type ids in 'int32', shape (batch_size, seq_length)
+
+        """
+
+        # convert to unicode
+        text_a = line[0]
+        if self._pair:
+            assert len(line) == 2
+            text_b = line[1]
+
+        tokens_a = self._tokenizer.tokenize(text_a)
+        tokens_b = None
+
+        if self._pair:
+            tokens_b = self._tokenizer.tokenize(text_b)
+
+        if tokens_b:
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length is less than the specified length.
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            self._truncate_seq_pair(tokens_a, tokens_b,
+                                    self._max_seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > self._max_seq_length - 2:
+                tokens_a = tokens_a[0:(self._max_seq_length - 2)]
+
+        # The embedding vectors for `type=0` and `type=1` were learned during
+        # pre-training and are added to the wordpiece embedding vector
+        # (and position vector). This is not *strictly* necessary since
+        # the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        #vocab = self._tokenizer.vocab
+        vocab = self._vocab
+        tokens = []
+        tokens.append(vocab.cls_token)
+        tokens.extend(tokens_a)
+        tokens.append(vocab.sep_token)
+        segment_ids = [0] * len(tokens)
+
+        if tokens_b:
+            tokens.extend(tokens_b)
+            tokens.append(vocab.sep_token)
+            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))
+
+        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
+
+        # The valid length of sentences. Only real tokens are attended to.
+        valid_length = len(input_ids)
+
+        if self._pad:
+            # Zero-pad up to the sequence length.
+            padding_length = self._max_seq_length - valid_length
+            # use padding tokens for the rest
+            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
+            segment_ids.extend([0] * padding_length)
+
+        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
+            np.array(segment_ids, dtype='int32')
+
 class BERTDataset(Dataset):
     def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                  pad, pair):
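Below is a minimal usage sketch of the newly added BERTSentenceTransform, assuming the class from this diff is in scope. ToyVocab and ToyTokenizer are hypothetical stand-ins invented here so the snippet runs without the real KoBERT tokenizer or gluonnlp installed; the app itself would pass its actual tokenizer and vocab. Only the single-sentence path (pair=False) is exercised, since the sentence-pair path also relies on a _truncate_seq_pair helper that is not part of this hunk.

import numpy as np


class ToyVocab:
    """Hypothetical vocab exposing the attributes BERTSentenceTransform reads."""
    cls_token = '[CLS]'
    sep_token = '[SEP]'
    padding_token = '[PAD]'

    def __init__(self, tokens):
        specials = [self.padding_token, self.cls_token, self.sep_token]
        self._token_to_id = {t: i for i, t in enumerate(specials + tokens)}

    def __getitem__(self, token):
        return self._token_to_id[token]


class ToyTokenizer:
    """Hypothetical whitespace tokenizer with the two methods the transform calls."""
    def __init__(self, vocab):
        self._vocab = vocab

    def tokenize(self, text):
        return text.split()

    def convert_tokens_to_ids(self, tokens):
        return [self._vocab[t] for t in tokens]


vocab = ToyVocab(tokens=['the', 'dog', 'is', 'hairy', '.'])
tokenizer = ToyTokenizer(vocab)

# Single-sentence mode, padded to a short max length so the output is easy to read.
transform = BERTSentenceTransform(tokenizer, max_seq_length=10, vocab=vocab,
                                  pad=True, pair=False)
input_ids, valid_length, segment_ids = transform(('the dog is hairy .',))

print(input_ids)     # [1 3 4 5 6 7 2 0 0 0] -> [CLS] ... [SEP] plus [PAD] ids
print(valid_length)  # 7 -> real tokens only, [CLS] and [SEP] included
print(segment_ids)   # [0 0 0 0 0 0 0 0 0 0] -> all first-sentence type ids

The three arrays mirror the docstring example: token ids padded out to max_seq_length, a scalar valid length used for the attention mask, and segment (token type) ids that would switch to 1 for a second sentence in pair mode.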