dazzleun-7 committed
Commit 94f9674 · verified
1 Parent(s): b04e9c4

Update app.py

Files changed (1): app.py +136 -0
app.py CHANGED
@@ -32,6 +32,142 @@ else:
      device = torch.device("cpu")
      print('No GPU available, using the CPU instead.')
 
+
+ class BERTSentenceTransform:
+     r"""BERT style data transformation.
+
+     Parameters
+     ----------
+     tokenizer : BERTTokenizer
+         Tokenizer for the sentences.
+     max_seq_length : int
+         Maximum sequence length of the sentences.
+     pad : bool, default True
+         Whether to pad the sentences to maximum length.
+     pair : bool, default True
+         Whether to transform sentences or sentence pairs.
+     """
+
+     # Store the tokenizer, maximum sequence length, vocab, and pad/pair settings passed in
+     def __init__(self, tokenizer, max_seq_length, vocab, pad=True, pair=True):
+         self._tokenizer = tokenizer
+         self._max_seq_length = max_seq_length
+         self._pad = pad
+         self._pair = pair
+         self._vocab = vocab
+
+     # Convert an input sentence or sentence pair into the format the BERT model expects
+     def __call__(self, line):
+         """Perform the transformation for sequence pairs or single sequences.
+
+         The transformation is processed in the following steps:
+         - tokenize the input sequences
+         - insert [CLS], [SEP] as necessary
+         - generate type ids to indicate whether a token belongs to the first
+           sequence or the second sequence
+         - generate the valid length
+
+         For sequence pairs, the input is a tuple of 2 strings:
+         text_a, text_b.
+
+         Inputs:
+             text_a: 'is this jacksonville ?'
+             text_b: 'no it is not'
+         Tokenization:
+             text_a: 'is this jack ##son ##ville ?'
+             text_b: 'no it is not .'
+         Processed:
+             tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
+             type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+             valid_length: 14
+
+         For single sequences, the input is a tuple of a single string:
+         text_a.
+
+         Inputs:
+             text_a: 'the dog is hairy .'
+         Tokenization:
+             text_a: 'the dog is hairy .'
+         Processed:
+             text_a: '[CLS] the dog is hairy . [SEP]'
+             type_ids: 0 0 0 0 0 0 0
+             valid_length: 7
+
+         Parameters
+         ----------
+         line : tuple of str
+             Input strings. For sequence pairs, the input is a tuple of 2 strings:
+             (text_a, text_b). For single sequences, the input is a tuple of a single
+             string: (text_a,).
+
+         Returns
+         -------
+         np.array: input token ids in 'int32', shape (seq_length,)
+         np.array: valid length in 'int32'
+         np.array: input token type ids in 'int32', shape (seq_length,)
+
+         """
+
+         # unpack the raw text(s) from the input tuple
+         text_a = line[0]
+         if self._pair:
+             assert len(line) == 2
+             text_b = line[1]
+
+         tokens_a = self._tokenizer.tokenize(text_a)
+         tokens_b = None
+
+         if self._pair:
+             # tokenize the second sentence with the same tokenizer as the first
+             tokens_b = self._tokenizer.tokenize(text_b)
+
+         if tokens_b:
+             # Modifies `tokens_a` and `tokens_b` in place so that the total
+             # length is less than the specified length.
+             # Account for [CLS], [SEP], [SEP] with "- 3"
+             self._truncate_seq_pair(tokens_a, tokens_b,
+                                     self._max_seq_length - 3)
+         else:
+             # Account for [CLS] and [SEP] with "- 2"
+             if len(tokens_a) > self._max_seq_length - 2:
+                 tokens_a = tokens_a[0:(self._max_seq_length - 2)]
+
+         # The embedding vectors for `type=0` and `type=1` were learned during
+         # pre-training and are added to the wordpiece embedding vector
+         # (and position vector). This is not *strictly* necessary since
+         # the [SEP] token unambiguously separates the sequences, but it makes
+         # it easier for the model to learn the concept of sequences.
+
+         # For classification tasks, the first vector (corresponding to [CLS]) is
+         # used as the "sentence vector". Note that this only makes sense because
+         # the entire model is fine-tuned.
+         # vocab = self._tokenizer.vocab
+         vocab = self._vocab
+         tokens = []
+         tokens.append(vocab.cls_token)
+         tokens.extend(tokens_a)
+         tokens.append(vocab.sep_token)
+         segment_ids = [0] * len(tokens)
+
+         if tokens_b:
+             tokens.extend(tokens_b)
+             tokens.append(vocab.sep_token)
+             segment_ids.extend([1] * (len(tokens) - len(segment_ids)))
+
+         input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
+
+         # The valid length of the sentence. Only real tokens are attended to.
+         valid_length = len(input_ids)
+
+         if self._pad:
+             # Zero-pad up to the sequence length.
+             padding_length = self._max_seq_length - valid_length
+             # use padding tokens for the rest
+             input_ids.extend([vocab[vocab.padding_token]] * padding_length)
+             segment_ids.extend([0] * padding_length)
+
+         return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'), \
+             np.array(segment_ids, dtype='int32')
+
  class BERTDataset(Dataset):
      def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                   pad, pair):
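
For reference, a minimal sketch of how the added BERTSentenceTransform could be exercised on its own, assuming the class above is in scope. The toy tokenizer and vocab below are hypothetical stand-ins that only mimic the attributes the transform relies on (tokenize, convert_tokens_to_ids, cls_token, sep_token, padding_token, and item lookup); the app itself would pass the real KoBERT tokenizer and vocab. The sketch sticks to pair=False, since the pair=True path calls self._truncate_seq_pair, which is not part of this hunk.

import numpy as np

class ToyTokenizer:
    # whitespace tokenizer, purely for illustration
    def tokenize(self, text):
        return text.split()

    def convert_tokens_to_ids(self, tokens):
        # fake, deterministic ids purely for demonstration
        return [1 + i for i, _ in enumerate(tokens)]

class ToyVocab:
    cls_token = '[CLS]'
    sep_token = '[SEP]'
    padding_token = '[PAD]'

    def __getitem__(self, token):
        # fake id used when padding
        return 0

transform = BERTSentenceTransform(ToyTokenizer(), max_seq_length=16,
                                  vocab=ToyVocab(), pad=True, pair=False)
input_ids, valid_length, segment_ids = transform(('the dog is hairy .',))
print(input_ids.shape, valid_length, segment_ids.shape)  # (16,) 7 (16,)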