madhavanvenkatesh
committed on
Refactor: Convert mask_token_id, pad_token_id, and all_special_ids to properties
Browse files
geneformer/collator_for_classification.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
"""
|
2 |
Geneformer collator for gene and cell classification.
|
3 |
-
|
4 |
Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
|
5 |
"""
|
6 |
|
@@ -85,13 +84,25 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
|
|
85 |
self.token_dictionary = kwargs.get("token_dictionary")
|
86 |
self.padding_side = "right"
|
87 |
self.model_input_names = ["input_ids"]
|
88 |
-
self.
|
89 |
-
self.
|
90 |
-
self.
|
91 |
self.token_dictionary.get("<mask>"),
|
92 |
self.token_dictionary.get("<pad>"),
|
93 |
]
|
94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
def _get_padding_truncation_strategies(
|
96 |
self,
|
97 |
padding=True,
|
@@ -258,29 +269,23 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
|
|
258 |
"""
|
259 |
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
|
260 |
in the batch.
|
261 |
-
|
262 |
Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
|
263 |
``self.pad_token_id`` and ``self.pad_token_type_id``)
|
264 |
-
|
265 |
.. note::
|
266 |
-
|
267 |
If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
|
268 |
result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
|
269 |
case of PyTorch tensors, you will lose the specific device of your tensors however.
|
270 |
-
|
271 |
Args:
|
272 |
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
|
273 |
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
|
274 |
List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
|
275 |
List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
|
276 |
well as in a PyTorch Dataloader collate function.
|
277 |
-
|
278 |
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
|
279 |
see the note above for the return type.
|
280 |
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
281 |
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
282 |
index) among:
|
283 |
-
|
284 |
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
285 |
single sequence is provided).
|
286 |
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
|
@@ -291,17 +296,14 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
|
|
291 |
Maximum length of the returned list and optionally padding length (see above).
|
292 |
pad_to_multiple_of (:obj:`int`, `optional`):
|
293 |
If set will pad the sequence to a multiple of the provided value.
|
294 |
-
|
295 |
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
|
296 |
>= 7.0 (Volta).
|
297 |
return_attention_mask (:obj:`bool`, `optional`):
|
298 |
Whether to return the attention mask. If left to the default, will return the attention mask according
|
299 |
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
|
300 |
-
|
301 |
`What are attention masks? <../glossary.html#attention-mask>`__
|
302 |
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
303 |
If set, will return tensors instead of list of python integers. Acceptable values are:
|
304 |
-
|
305 |
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
306 |
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
|
307 |
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
|
@@ -418,18 +420,15 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
|
|
418 |
) -> dict:
|
419 |
"""
|
420 |
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
|
421 |
-
|
422 |
Args:
|
423 |
encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
|
424 |
max_length: maximum length of the returned list and optionally padding length (see below).
|
425 |
Will truncate by taking into account the special tokens.
|
426 |
padding_strategy: PaddingStrategy to use for padding.
|
427 |
-
|
428 |
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
|
429 |
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
|
430 |
- PaddingStrategy.DO_NOT_PAD: Do not pad
|
431 |
The tokenizer padding sides are defined in self.padding_side:
|
432 |
-
|
433 |
- 'left': pads on the left of the sequences
|
434 |
- 'right': pads on the right of the sequences
|
435 |
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
|
|
1 |
"""
|
2 |
Geneformer collator for gene and cell classification.
|
|
|
3 |
Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
|
4 |
"""
|
5 |
|
|
|
84 |
self.token_dictionary = kwargs.get("token_dictionary")
|
85 |
self.padding_side = "right"
|
86 |
self.model_input_names = ["input_ids"]
|
87 |
+
self._mask_token_id = self.token_dictionary.get("<mask>")
|
88 |
+
self._pad_token_id = self.token_dictionary.get("<pad>")
|
89 |
+
self._all_special_ids = [
|
90 |
self.token_dictionary.get("<mask>"),
|
91 |
self.token_dictionary.get("<pad>"),
|
92 |
]
|
93 |
|
94 |
+
@property
|
95 |
+
def all_special_ids(self):
|
96 |
+
return self._all_special_ids
|
97 |
+
|
98 |
+
@property
|
99 |
+
def mask_token_id(self):
|
100 |
+
return self._mask_token_id
|
101 |
+
|
102 |
+
@property
|
103 |
+
def pad_token_id(self):
|
104 |
+
return self._pad_token_id
|
105 |
+
|
106 |
def _get_padding_truncation_strategies(
|
107 |
self,
|
108 |
padding=True,
|
|
|
269 |
"""
|
270 |
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
|
271 |
in the batch.
|
|
|
272 |
Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
|
273 |
``self.pad_token_id`` and ``self.pad_token_type_id``)
|
|
|
274 |
.. note::
|
|
|
275 |
If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
|
276 |
result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
|
277 |
case of PyTorch tensors, you will lose the specific device of your tensors however.
|
|
|
278 |
Args:
|
279 |
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
|
280 |
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
|
281 |
List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
|
282 |
List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
|
283 |
well as in a PyTorch Dataloader collate function.
|
|
|
284 |
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
|
285 |
see the note above for the return type.
|
286 |
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
287 |
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
288 |
index) among:
|
|
|
289 |
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
290 |
single sequence is provided).
|
291 |
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
|
|
|
296 |
Maximum length of the returned list and optionally padding length (see above).
|
297 |
pad_to_multiple_of (:obj:`int`, `optional`):
|
298 |
If set will pad the sequence to a multiple of the provided value.
|
|
|
299 |
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
|
300 |
>= 7.0 (Volta).
|
301 |
return_attention_mask (:obj:`bool`, `optional`):
|
302 |
Whether to return the attention mask. If left to the default, will return the attention mask according
|
303 |
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
|
|
|
304 |
`What are attention masks? <../glossary.html#attention-mask>`__
|
305 |
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
306 |
If set, will return tensors instead of list of python integers. Acceptable values are:
|
|
|
307 |
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
308 |
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
|
309 |
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
|
|
|
420 |
) -> dict:
|
421 |
"""
|
422 |
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
|
|
|
423 |
Args:
|
424 |
encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
|
425 |
max_length: maximum length of the returned list and optionally padding length (see below).
|
426 |
Will truncate by taking into account the special tokens.
|
427 |
padding_strategy: PaddingStrategy to use for padding.
|
|
|
428 |
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
|
429 |
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
|
430 |
- PaddingStrategy.DO_NOT_PAD: Do not pad
|
431 |
The tokenizer padding sides are defined in self.padding_side:
|
|
|
432 |
- 'left': pads on the left of the sequences
|
433 |
- 'right': pads on the right of the sequences
|
434 |
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|