ligeti committed
Commit a812259 · verified · 1 Parent(s): 35764e0

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +1 -1
  2. tokenizer.py +450 -0
  3. tokenizer_config.json +53 -3
  4. vocab.txt +0 -1
special_tokens_map.json CHANGED
@@ -3,5 +3,5 @@
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "unk_token": "N"
+  "unk_token": "[UNK]"
 }
tokenizer.py ADDED
@@ -0,0 +1,450 @@
import collections
import os
import json
from copy import deepcopy
from typing import List, Optional, Tuple, Dict, Set
from transformers import PreTrainedTokenizer
from transformers.utils import logging
from itertools import product

logger = logging.get_logger(__name__)

# from .config_utils import SeqConfig
# from .sequtils import generate_kmers, lca_kmer_tokenize_segment

# Define the names of the vocabulary files
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Define the mapping for pretrained vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "lca-mini-k6s1": "lca-base-dna6/vocab.txt",
        "lca-mini-k6s2": "lca-base-dna6/vocab.txt",
        "lca-mini-k1s1": "lca-base-dna1/vocab.txt",
    }
}

# Define positional embedding sizes for pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "lca-mini-k6s1": 1024,
    "lca-mini-k1s1": 1024,
    "lca-mini-k6s2": 2048,
}

# Define initial configuration for pretrained models
PRETRAINED_INIT_CONFIGURATION = {
    "lca-mini-k6s1": {"do_upper_case": True},
    "lca-mini-k1s1": {"do_upper_case": True},
    "lca-mini-k6s2": {"do_upper_case": True},
}


def generate_kmers(abc: Set[str], k: int) -> List[str]:
    """
    Generates all possible k-mers from a given alphabet.

    :param abc: The alphabet.
    :type abc: Set[str]
    :param k: Length of the k-mers.
    :type k: int
    :return: List of all possible k-mers.
    :rtype: List[str]
    """
    return [''.join(p) for p in product(abc, repeat=k)]


# Utility function to load vocabulary from a file
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        vocab[token.rstrip("\n")] = index
    return vocab


def resolve_vocab_file(vocab_file: Optional[str], kmer) -> str:
    """
    Resolves the path to the vocabulary file. If not provided, tries to load it
    from the installed prokbert package or download it from the GitHub repository.

    Args:
        vocab_file (str, optional): Path to the vocabulary file.
        kmer (int): K-mer size used to locate the default vocabulary.

    Returns:
        str: Path to the resolved vocabulary file.

    Raises:
        FileNotFoundError: If the vocabulary file cannot be resolved.
    """
    if vocab_file and os.path.exists(vocab_file):
        return vocab_file

    # Attempt 1: Check if prokbert is installed
    try:
        import prokbert
        package_dir = os.path.dirname(prokbert.__file__)
        vocab_path = os.path.join(package_dir, 'data/prokbert_vocabs/', f'prokbert-base-dna{kmer}', 'vocab.txt')
        print(vocab_path)
        # vocabfile_path = join(self.current_path, 'data/prokbert_vocabs/', f'prokbert-base-dna{act_kmer}', 'vocab.txt')
        if os.path.exists(vocab_path):
            logger.info(f"Loaded vocab file from installed prokbert package: {vocab_path}")
            return vocab_path
    except ImportError:
        logger.info("Prokbert package not installed, proceeding to download vocab.txt.")

    # Attempt 2: Download from GitHub repository
    github_url = "https://raw.githubusercontent.com/username/prokbert/main/vocab.txt"
    temp_vocab_path = os.path.join(os.getcwd(), "vocab.txt")
    try:
        import requests

        response = requests.get(github_url, timeout=10)
        response.raise_for_status()  # Raise an error for HTTP failures
        with open(temp_vocab_path, "w", encoding="utf-8") as f:
            f.write(response.text)
        logger.info(f"Downloaded vocab.txt from GitHub to: {temp_vocab_path}")
        return temp_vocab_path
    except requests.RequestException as e:
        raise FileNotFoundError(
            "Could not find or download vocab.txt. Ensure prokbert is installed or "
            f"provide a valid vocab file path. Error: {e}"
        ) from e

class LCATokenizer(PreTrainedTokenizer):
    """
    Custom tokenizer for LCA (Local Context Aware) tasks.
    Handles specific tokenization processes, including k-mer tokenization with configurable shifts.

    Attributes:
        vocab_files_names (dict): Mapping of vocabulary file names.
        pretrained_vocab_files_map (dict): Mapping of pretrained vocabulary files.
        pretrained_init_configuration (dict): Initial configuration for pretrained models.
        max_model_input_sizes (dict): Maximum input sizes for pretrained models.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    nucleotide_abc = {"A", "T", "C", "G"}
    extended_nucleotide_abc = {"A", "T", "C", "G", "*"}
    sequence_unk_token = 'N'

    default_unk_token = "[UNK]"
    default_sep_token = "[SEP]"
    default_pad_token = "[PAD]"
    default_cls_token = "[CLS]"
    default_mask_token = "[MASK]"

    vocab_files_names = {"vocab_file": "vocab.txt"}

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        kmer: int = 6,
        shift: int = 1,
        operation_space: str = "kmer",
        **kwargs,
    ):
        """
        Initializes the LCATokenizer.

        Args:
            vocab_file (str): Path to the vocabulary file.
            kmer (int): K-mer size for tokenization.
            shift (int): Shift size for tokenization.
            operation_space (str): Defines operation mode ('kmer' or 'sequence').
            kwargs: Additional arguments for PreTrainedTokenizer.
        """
        # Load vocabulary directly from the vocab file
        self.config = {}
        resolved_vocab_file = resolve_vocab_file(vocab_file, kmer)
        self.vocab = load_vocab(resolved_vocab_file)
        # self.vocab = load_vocab(vocab_file)
        self.id2token = {v: k for k, v in self.vocab.items()}
        self.kmer = kmer
        self.shift = shift
        self.operation_space = operation_space

        self.config["kmer"] = kmer
        self.config["shift"] = shift
        self.config["operation_space"] = operation_space

        # Special tokens
        kwargs.setdefault("cls_token", "[CLS]")
        kwargs.setdefault("sep_token", "[SEP]")
        kwargs.setdefault("pad_token", "[PAD]")
        kwargs.setdefault("unk_token", "[UNK]")
        kwargs.setdefault("mask_token", "[MASK]")
        self.special_tokens = [kwargs["cls_token"], kwargs["sep_token"], kwargs["pad_token"], kwargs["unk_token"], kwargs["mask_token"]]
        super().__init__(**kwargs)
        if self.operation_space == 'sequence':
            token_extension = sorted(list(set(generate_kmers(LCATokenizer.extended_nucleotide_abc, self.config['kmer'])) -
                                          set(generate_kmers(LCATokenizer.nucleotide_abc, self.config['kmer']))))
            self.extended_vocab = deepcopy(self.vocab)
            for token in token_extension:
                self.extended_vocab[token] = 4

            self.unk_token = LCATokenizer.sequence_unk_token * self.config['shift']
            self.mask_token = '*'
            self.extended_vocab[self.mask_token] = self.vocab['[MASK]']

            full_unk = 'N' * self.config['kmer']
            self.vocab[full_unk] = 1
            self.id2token[1] = full_unk
            self.full_unk_token = full_unk

        else:
            self.extended_vocab = self.vocab
            self.unk_token = '[UNK]'

        self.unkown_tokenid = self.vocab['[UNK]']
        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.mask_token = '[MASK]'
        self.special_tokens = list(self.special_tokens_map.values())

    def get_vocab(self) -> Dict[str, int]:
        return self.vocab

    def _tokenize(self, text, **kwargs):
        """
        Tokenizes the input text using LCA tokenization with an optional offset.

        Args:
            text (str): The input DNA sequence to tokenize.
            kwargs: Additional arguments, including:
                - offset (int): The starting position for tokenization. Default is 0.

        Returns:
            List[str]: A list of tokens generated from the input text.
        """
        offset = kwargs.get("offset", 0)
        # if offset < 0 or offset >= self.config.get("shift", 1):
        #     raise ValueError(f"Invalid offset: {offset}. Must be between 0 and {self.config['shift'] - 1}.")
        return self.lca_kmer_tokenize_segment(text, offset)

    def _convert_token_to_id(self, token: str) -> int:
        """
        Converts a token to its corresponding ID using the vocabulary.

        Args:
            token (str): The token to convert.

        Returns:
            int: Token ID, or the unknown token ID if the token is not in the vocabulary.
        """
        return self.extended_vocab.get(token, self.unkown_tokenid)

    def _convert_id_to_token(self, index: int) -> str:
        """
        Converts an ID to its corresponding token using the vocabulary.

        Args:
            index (int): The ID to convert.

        Returns:
            str: Corresponding token, or the unknown token if the ID is not in the vocabulary.
        """
        return self.id2token.get(index, self.unk_token)

    def __len__(self) -> int:
        """
        Returns the number of entries in the tokenizer's vocabulary.

        :return: The size of the vocabulary.
        :rtype: int
        """
        return len(self.vocab)

    def lca_kmer_tokenize_segment(self, segment: str, offset: int):
        # calculate the tokenization for one offset value
        shift = self.shift
        kmer = self.kmer
        # max_segment_length = params['max_segment_length']
        # max_unknown_token_proportion = params['max_unknown_token_proportion']
        # kmer = params['kmer']
        # token_limit = params['token_limit']
        # vocabmap = params['vocabmap']
        # add_special_token = params['add_special_token']
        # if len(segment) > max_segment_length:
        #     raise(ValueError(f'The segment is longer {len(segment)} then the maximum allowed segment length ({max_segment_length}). '))

        kmers = [segment[i:i + kmer] for i in range(offset, len(segment) - kmer + 1, shift)]

        return kmers

    def tokenize(self, text: str, **kwargs) -> List[str]:
        """
        Tokenizes the input text using LCA tokenization.

        Args:
            text (str): The input DNA sequence to tokenize.
            kwargs: Additional arguments, including:
                - offset (int): The starting position for tokenization. Default is 0.

        Returns:
            List[str]: A list of tokens generated from the input text.
        """
        return self._tokenize(text, **kwargs)

    def encode(self, text: str, **kwargs) -> List[int]:
        """
        Extends the base `encode` method to support an `offset` parameter for custom tokenization logic.

        Args:
            text (str): Input text (DNA sequence).
            offset (int): Offset parameter for the LCA tokenization. Defaults to 0.
            kwargs: Additional arguments passed to the base `encode` method.

        Returns:
            List[int]: Encoded token IDs.
        """
        # Inject the offset into kwargs for the tokenizer
        offset = kwargs.get("offset", 0)
        kwargs["offset"] = offset
        return super().encode(text, **kwargs)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Builds inputs by adding special tokens to a sequence or pair of sequences.

        Args:
            token_ids_0 (List[int]): List of token IDs for the first sequence.
            token_ids_1 (List[int], optional): List of token IDs for the second sequence.

        Returns:
            List[int]: Input IDs with special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        input_ids = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]
        # token_type_ids = [0 for i in range(len(input_ids))]
        return input_ids

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)

        Should be overridden in a subclass if the model has a special way of building those.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

        Returns:
            `List[int]`: The token type ids.
        """
        if token_ids_1 is None:
            return (len(token_ids_0) + 2) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    def batch_encode_plus(self, *args, **kwargs):
        """
        Extends the base `batch_encode_plus` method to add custom functionality if needed.

        Args:
            *args: Positional arguments passed to the base method.
            **kwargs: Keyword arguments passed to the base method.

        Returns:
            dict: A dictionary containing the results of batch encoding.
        """
        # Call the parent method to handle the batch encoding
        # print('Running batch encoding with ids')
        act_outputs = super().batch_encode_plus(*args, **kwargs)
        return act_outputs

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Saves the tokenizer's vocabulary to a file.

        Args:
            save_directory (str): Directory to save the vocabulary file.
            filename_prefix (str, optional): Prefix for the filename. Default is None.

        Returns:
            Tuple[str]: Path to the saved vocabulary file.
        """
        if filename_prefix is None:
            filename_prefix = ""
        vocab_file_path = os.path.join(save_directory, filename_prefix + "vocab.txt")
        with open(vocab_file_path, "w") as f:
            for token in self.vocab:
                f.write(token + "\n")
        return (vocab_file_path,)

    @property
    def vocab_size(self) -> int:
        """
        Returns the size of the vocabulary (number of tokens in `vocab.txt`).

        Returns:
            int: The size of the vocabulary.
        """
        return len(self.vocab)

    def save_pretrained(self, save_directory: str, **kwargs):
        """
        Save the tokenizer configuration and vocabulary to a directory.

        Args:
            save_directory (str): Directory to save the tokenizer files.
            kwargs: Additional arguments for saving.
        """
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)

        # Save the base tokenizer configuration
        super().save_pretrained(save_directory, **kwargs)

        # Path to the tokenizer configuration file
        tokenizer_config_path = os.path.join(save_directory, "tokenizer_config.json")

        # Load the existing configuration or create a new one
        if os.path.exists(tokenizer_config_path):
            with open(tokenizer_config_path, "r", encoding="utf-8") as f:
                tokenizer_config = json.load(f)
        else:
            tokenizer_config = {}

        # Add custom fields for AutoTokenizer and remote code
        # tokenizer_config["auto_map"] = {
        #     "AutoTokenizer": "src.prokbert.tokenizer.LCATokenizer"
        # }
        # tokenizer_config["repository"] = "https://github.com/nbrg-ppcu/prokbert"
        # tokenizer_config["trust_remote_code"] = True
        tokenizer_config["kmer"] = self.kmer
        tokenizer_config["shift"] = self.shift
        tokenizer_config["operation_space"] = self.operation_space
        # Save the updated configuration
        with open(tokenizer_config_path, "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, indent=2)
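
For reviewers, a minimal usage sketch (not part of this commit): with the files from this commit in a local checkout or Hub repo, the auto_map entry added to tokenizer_config.json lets AutoTokenizer resolve tokenizer.LCATokenizer when trust_remote_code=True. The path below is a placeholder, and the expected tokens follow from the defaults kmer=6, shift=1 used by lca_kmer_tokenize_segment.

from transformers import AutoTokenizer

# "path/to/this/repo" is a placeholder for the Hub repo id or a local directory
# containing tokenizer.py, vocab.txt and tokenizer_config.json from this commit.
tok = AutoTokenizer.from_pretrained("path/to/this/repo", trust_remote_code=True)

seq = "ATGCATGCATGC"
# With kmer=6 and shift=1, a 6-mer window slides one base at a time.
print(tok.tokenize(seq))   # ['ATGCAT', 'TGCATG', 'GCATGC', 'CATGCA', ...]
# encode() wraps the k-mer ids in [CLS] ... [SEP] via build_inputs_with_special_tokens.
print(tok.encode(seq))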
tokenizer_config.json CHANGED
@@ -1,6 +1,55 @@
  {
- "clean_up_tokenization_spaces": true,
+ "added_tokens_decoder": {
+   "0": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "1": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "2": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "3": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "4": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   }
+ },
+ "auto_map": {
+   "AutoTokenizer": [
+     "tokenizer.LCATokenizer",
+     null
+   ]
+ },
+ "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
+ "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
@@ -8,5 +57,6 @@
  "tokenizer_class": "LCATokenizer",
  "unk_token": "[UNK]",
  "kmer": 6,
- "shift": 1
- }
+ "shift": 1,
+ "operation_space": "kmer"
+ }
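
A small round-trip sketch (an assumption for illustration, not part of the commit) of why kmer, shift, and operation_space appear in this config: the overridden save_pretrained in tokenizer.py writes them back into tokenizer_config.json after the base class has saved its files. It assumes tokenizer.py and vocab.txt from this commit sit in the working directory.

from tokenizer import LCATokenizer

tok = LCATokenizer(vocab_file="vocab.txt", kmer=6, shift=1, operation_space="kmer")
tok.save_pretrained("./lca-tok")   # writes vocab.txt plus tokenizer_config.json with kmer/shift/operation_space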
vocab.txt CHANGED
@@ -4099,4 +4099,3 @@ TTTTTA
  TTTTTC
  TTTTTG
  TTTTTT
- NNNNNN
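
For context on the vocab.txt change, a hypothetical reconstruction of the vocabulary layout inferred from the files above (not the script used in this commit): ids 0-4 are the five special tokens declared in added_tokens_decoder, followed by all 4^6 = 4096 DNA 6-mers, giving the 4101 entries seen here; the trailing NNNNNN row removed in this commit is instead injected at runtime (mapped to id 1) when operation_space == "sequence".

from itertools import product

specials = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]   # ids 0-4, per added_tokens_decoder
kmers = ["".join(p) for p in product("ACGT", repeat=6)]     # 4**6 = 4096 six-mers, lexicographic order
vocab = specials + kmers

assert len(vocab) == 4101 and vocab[-1] == "TTTTTT"         # matches the final line of vocab.txt
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(vocab) + "\n")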