dazzleun-7 committed
Commit 94f9674 · verified
1 Parent(s): b04e9c4

Update app.py

Files changed (1): app.py +136 -0
app.py CHANGED
@@ -32,6 +32,142 @@ else:
      device = torch.device("cpu")
      print('No GPU available, using the CPU instead.')
 
+
+ class BERTSentenceTransform:
+     r"""BERT style data transformation.
+
+     Parameters
+     ----------
+     tokenizer : BERTTokenizer
+         Tokenizer for the sentences.
+     max_seq_length : int
+         Maximum sequence length of the sentences.
+     pad : bool, default True
+         Whether to pad the sentences to maximum length.
+     pair : bool, default True
+         Whether to transform sentences or sentence pairs.
+     """
+
+     # Store the tokenizer, maximum sequence length, vocab, and pad/pair settings passed in
+     def __init__(self, tokenizer, max_seq_length, vocab, pad=True, pair=True):
+         self._tokenizer = tokenizer
+         self._max_seq_length = max_seq_length
+         self._pad = pad
+         self._pair = pair
+         self._vocab = vocab
+
+     # Convert an input sentence or sentence pair into the format the BERT model expects
+     def __call__(self, line):
+         """Perform the transformation for sequence pairs or single sequences.
+
+         The transformation is processed in the following steps:
+         - tokenize the input sequences
+         - insert [CLS], [SEP] as necessary
+         - generate type ids to indicate whether a token belongs to the first
+           sequence or the second sequence
+         - generate the valid length
+
+         For sequence pairs, the input is a tuple of 2 strings:
+         text_a, text_b.
+
+         Inputs:
+             text_a: 'is this jacksonville ?'
+             text_b: 'no it is not'
+         Tokenization:
+             text_a: 'is this jack ##son ##ville ?'
+             text_b: 'no it is not .'
+         Processed:
+             tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
+             type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+             valid_length: 14
+
+         For single sequences, the input is a tuple of a single string:
+         text_a.
+
+         Inputs:
+             text_a: 'the dog is hairy .'
+         Tokenization:
+             text_a: 'the dog is hairy .'
+         Processed:
+             text_a: '[CLS] the dog is hairy . [SEP]'
+             type_ids: 0 0 0 0 0 0 0
+             valid_length: 7
+
+         Parameters
+         ----------
+         line : tuple of str
+             Input strings. For sequence pairs, the input is a tuple of 2 strings:
+             (text_a, text_b). For single sequences, the input is a tuple of a single
+             string: (text_a,).
+
+         Returns
+         -------
+         np.array: input token ids in 'int32', shape (seq_length,)
+         np.array: valid length in 'int32'
+         np.array: input token type ids in 'int32', shape (seq_length,)
+
+         """
+
+         # unpack the raw text(s) from the input tuple
+         text_a = line[0]
+         if self._pair:
+             assert len(line) == 2
+             text_b = line[1]
+
+         tokens_a = self._tokenizer.tokenize(text_a)
+         tokens_b = None
+
+         if self._pair:
+             # tokenize the second sentence with the same tokenizer as the first
+             tokens_b = self._tokenizer.tokenize(text_b)
+
+         if tokens_b:
+             # Modifies `tokens_a` and `tokens_b` in place so that the total
+             # length is less than the specified length.
+             # Account for [CLS], [SEP], [SEP] with "- 3"
+             self._truncate_seq_pair(tokens_a, tokens_b,
+                                     self._max_seq_length - 3)
+         else:
+             # Account for [CLS] and [SEP] with "- 2"
+             if len(tokens_a) > self._max_seq_length - 2:
+                 tokens_a = tokens_a[0:(self._max_seq_length - 2)]
+
+         # The embedding vectors for `type=0` and `type=1` were learned during
+         # pre-training and are added to the wordpiece embedding vector
+         # (and position vector). This is not *strictly* necessary since
+         # the [SEP] token unambiguously separates the sequences, but it makes
+         # it easier for the model to learn the concept of sequences.
+
+         # For classification tasks, the first vector (corresponding to [CLS]) is
+         # used as the "sentence vector". Note that this only makes sense because
+         # the entire model is fine-tuned.
+         # vocab = self._tokenizer.vocab
+         vocab = self._vocab
+         tokens = []
+         tokens.append(vocab.cls_token)
+         tokens.extend(tokens_a)
+         tokens.append(vocab.sep_token)
+         segment_ids = [0] * len(tokens)
+
+         if tokens_b:
+             tokens.extend(tokens_b)
+             tokens.append(vocab.sep_token)
+             segment_ids.extend([1] * (len(tokens) - len(segment_ids)))
+
+         input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
+
+         # The valid length of the sentence. Only real tokens are attended to.
+         valid_length = len(input_ids)
+
+         if self._pad:
+             # Zero-pad up to the sequence length.
+             padding_length = self._max_seq_length - valid_length
+             # use padding tokens for the rest
+             input_ids.extend([vocab[vocab.padding_token]] * padding_length)
+             segment_ids.extend([0] * padding_length)
+
+         return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'), \
+             np.array(segment_ids, dtype='int32')
+
  class BERTDataset(Dataset):
      def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                   pad, pair):
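
For reference, a minimal sketch of how the added BERTSentenceTransform could be exercised on its own, assuming the class above is in scope. The toy tokenizer and vocab below are hypothetical stand-ins that only mimic the attributes the transform relies on (tokenize, convert_tokens_to_ids, cls_token, sep_token, padding_token, and item lookup); the app itself would pass the real KoBERT tokenizer and vocab. The sketch sticks to pair=False, since the pair=True path calls self._truncate_seq_pair, which is not part of this hunk.

import numpy as np

class ToyTokenizer:
    # whitespace tokenizer, purely for illustration
    def tokenize(self, text):
        return text.split()

    def convert_tokens_to_ids(self, tokens):
        # fake, deterministic ids purely for demonstration
        return [1 + i for i, _ in enumerate(tokens)]

class ToyVocab:
    cls_token = '[CLS]'
    sep_token = '[SEP]'
    padding_token = '[PAD]'

    def __getitem__(self, token):
        # fake id used when padding
        return 0

transform = BERTSentenceTransform(ToyTokenizer(), max_seq_length=16,
                                  vocab=ToyVocab(), pad=True, pair=False)
input_ids, valid_length, segment_ids = transform(('the dog is hairy .',))
print(input_ids.shape, valid_length, segment_ids.shape)  # (16,) 7 (16,)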