ligeti committed
Commit a812259 · verified · 1 Parent(s): 35764e0

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +1 -1
  2. tokenizer.py +450 -0
  3. tokenizer_config.json +53 -3
  4. vocab.txt +0 -1
special_tokens_map.json CHANGED
@@ -3,5 +3,5 @@
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "unk_token": "N"
+  "unk_token": "[UNK]"
 }
tokenizer.py ADDED
@@ -0,0 +1,450 @@
import collections
import os
import json
from copy import deepcopy
from typing import List, Optional, Tuple, Dict, Set
from transformers import PreTrainedTokenizer
from transformers.utils import logging
from itertools import product

logger = logging.get_logger(__name__)

# from .config_utils import SeqConfig
# from .sequtils import generate_kmers, lca_kmer_tokenize_segment

# Define the names of the vocabulary files
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Define the mapping for pretrained vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "lca-mini-k6s1": "lca-base-dna6/vocab.txt",
        "lca-mini-k6s2": "lca-base-dna6/vocab.txt",
        "lca-mini-k1s1": "lca-base-dna1/vocab.txt",
    }
}

# Define positional embedding sizes for pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "lca-mini-k6s1": 1024,
    "lca-mini-k1s1": 1024,
    "lca-mini-k6s2": 2048,
}

# Define initial configuration for pretrained models
PRETRAINED_INIT_CONFIGURATION = {
    "lca-mini-k6s1": {"do_upper_case": True},
    "lca-mini-k1s1": {"do_upper_case": True},
    "lca-mini-k6s2": {"do_upper_case": True},
}


def generate_kmers(abc: Set[str], k: int) -> List[str]:
    """
    Generates all possible k-mers from a given alphabet.

    :param abc: The alphabet.
    :type abc: Set[str]
    :param k: Length of the k-mers.
    :type k: int
    :return: List of all possible k-mers.
    :rtype: List[str]
    """
    return [''.join(p) for p in product(abc, repeat=k)]


# Utility function to load vocabulary from a file
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        vocab[token.rstrip("\n")] = index
    return vocab


def resolve_vocab_file(vocab_file: Optional[str], kmer) -> str:
    """
    Resolves the path to the vocabulary file. If not provided, tries to load it
    from the installed prokbert package or download it from the GitHub repository.

    Args:
        vocab_file (str, optional): Path to the vocabulary file.
        kmer (int): K-mer size used to locate the default vocabulary.

    Returns:
        str: Path to the resolved vocabulary file.

    Raises:
        FileNotFoundError: If the vocabulary file cannot be resolved.
    """
    if vocab_file and os.path.exists(vocab_file):
        return vocab_file

    # Attempt 1: Check if prokbert is installed
    try:
        import prokbert
        package_dir = os.path.dirname(prokbert.__file__)
        vocab_path = os.path.join(package_dir, 'data/prokbert_vocabs/', f'prokbert-base-dna{kmer}', 'vocab.txt')
        print(vocab_path)
        # vocabfile_path = join(self.current_path, 'data/prokbert_vocabs/', f'prokbert-base-dna{act_kmer}', 'vocab.txt')
        if os.path.exists(vocab_path):
            logger.info(f"Loaded vocab file from installed prokbert package: {vocab_path}")
            return vocab_path
    except ImportError:
        logger.info("Prokbert package not installed, proceeding to download vocab.txt.")

    # Attempt 2: Download from GitHub repository
    github_url = "https://raw.githubusercontent.com/username/prokbert/main/vocab.txt"
    temp_vocab_path = os.path.join(os.getcwd(), "vocab.txt")
    try:
        import requests

        response = requests.get(github_url, timeout=10)
        response.raise_for_status()  # Raise an error for HTTP failures
        with open(temp_vocab_path, "w", encoding="utf-8") as f:
            f.write(response.text)
        logger.info(f"Downloaded vocab.txt from GitHub to: {temp_vocab_path}")
        return temp_vocab_path
    except requests.RequestException as e:
        raise FileNotFoundError(
            "Could not find or download vocab.txt. Ensure prokbert is installed or "
            f"provide a valid vocab file path. Error: {e}"
        ) from e

class LCATokenizer(PreTrainedTokenizer):
    """
    Custom tokenizer for LCA (Local Context Aware) tasks.
    Handles specific tokenization processes, including k-mer tokenization with configurable shifts.

    Attributes:
        vocab_files_names (dict): Mapping of vocabulary file names.
        pretrained_vocab_files_map (dict): Mapping of pretrained vocabulary files.
        pretrained_init_configuration (dict): Initial configuration for pretrained models.
        max_model_input_sizes (dict): Maximum input sizes for pretrained models.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    nucleotide_abc = {"A", "T", "C", "G"}
    extended_nucleotide_abc = {"A", "T", "C", "G", "*"}
    sequence_unk_token = 'N'

    default_unk_token = "[UNK]"
    default_sep_token = "[SEP]"
    default_pad_token = "[PAD]"
    default_cls_token = "[CLS]"
    default_mask_token = "[MASK]"

    vocab_files_names = {"vocab_file": "vocab.txt"}

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        kmer: int = 6,
        shift: int = 1,
        operation_space: str = "kmer",
        **kwargs,
    ):
        """
        Initializes the LCATokenizer.

        Args:
            vocab_file (str): Path to the vocabulary file.
            kmer (int): K-mer size for tokenization.
            shift (int): Shift size for tokenization.
            operation_space (str): Defines operation mode ('kmer' or 'sequence').
            kwargs: Additional arguments for PreTrainedTokenizer.
        """
        # Load vocabulary directly from the vocab file
        self.config = {}
        resolved_vocab_file = resolve_vocab_file(vocab_file, kmer)
        self.vocab = load_vocab(resolved_vocab_file)
        # self.vocab = load_vocab(vocab_file)
        self.id2token = {v: k for k, v in self.vocab.items()}
        self.kmer = kmer
        self.shift = shift
        self.operation_space = operation_space

        self.config["kmer"] = kmer
        self.config["shift"] = shift
        self.config["operation_space"] = operation_space

        # Special tokens
        kwargs.setdefault("cls_token", "[CLS]")
        kwargs.setdefault("sep_token", "[SEP]")
        kwargs.setdefault("pad_token", "[PAD]")
        kwargs.setdefault("unk_token", "[UNK]")
        kwargs.setdefault("mask_token", "[MASK]")
        self.special_tokens = [kwargs["cls_token"], kwargs["sep_token"], kwargs["pad_token"], kwargs["unk_token"], kwargs["mask_token"]]
        super().__init__(**kwargs)
        if self.operation_space == 'sequence':
            token_extension = sorted(list(set(generate_kmers(LCATokenizer.extended_nucleotide_abc, self.config['kmer'])) -
                                          set(generate_kmers(LCATokenizer.nucleotide_abc, self.config['kmer']))))
            self.extended_vocab = deepcopy(self.vocab)
            for token in token_extension:
                self.extended_vocab[token] = 4

            self.unk_token = LCATokenizer.sequence_unk_token * self.config['shift']
            self.mask_token = '*'
            self.extended_vocab[self.mask_token] = self.vocab['[MASK]']

            full_unk = 'N' * self.config['kmer']
            self.vocab[full_unk] = 1
            self.id2token[1] = full_unk
            self.full_unk_token = full_unk

        else:
            self.extended_vocab = self.vocab
            self.unk_token = '[UNK]'

        self.unkown_tokenid = self.vocab['[UNK]']
        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.mask_token = '[MASK]'
        self.special_tokens = list(self.special_tokens_map.values())

    def get_vocab(self) -> Dict[str, int]:
        return self.vocab

    def _tokenize(self, text, **kwargs):
        """
        Tokenizes the input text using LCA tokenization with an optional offset.

        Args:
            text (str): The input DNA sequence to tokenize.
            kwargs: Additional arguments, including:
                - offset (int): The starting position for tokenization. Default is 0.

        Returns:
            List[str]: A list of tokens generated from the input text.
        """
        offset = kwargs.get("offset", 0)
        # if offset < 0 or offset >= self.config.get("shift", 1):
        #     raise ValueError(f"Invalid offset: {offset}. Must be between 0 and {self.config['shift'] - 1}.")
        return self.lca_kmer_tokenize_segment(text, offset)

    def _convert_token_to_id(self, token: str) -> int:
        """
        Converts a token to its corresponding ID using the vocabulary.

        Args:
            token (str): The token to convert.

        Returns:
            int: Token ID, or the unknown token ID if the token is not in the vocabulary.
        """
        return self.extended_vocab.get(token, self.unkown_tokenid)

    def _convert_id_to_token(self, index: int) -> str:
        """
        Converts an ID to its corresponding token using the vocabulary.

        Args:
            index (int): The ID to convert.

        Returns:
            str: Corresponding token, or the unknown token if the ID is not in the vocabulary.
        """
        return self.id2token.get(index, self.unk_token)

    def __len__(self) -> int:
        """
        Returns the number of entries in the tokenizer's vocabulary.

        :return: The size of the vocabulary.
        :rtype: int
        """
        return len(self.vocab)

    def lca_kmer_tokenize_segment(self, segment: str, offset: int):
        # calculate the tokenization for one offset value
        shift = self.shift
        kmer = self.kmer
        # max_segment_length = params['max_segment_length']
        # max_unknown_token_proportion = params['max_unknown_token_proportion']
        # kmer = params['kmer']
        # token_limit = params['token_limit']
        # vocabmap = params['vocabmap']
        # add_special_token = params['add_special_token']
        # if len(segment) > max_segment_length:
        #     raise(ValueError(f'The segment is longer {len(segment)} then the maximum allowed segment length ({max_segment_length}). '))

        kmers = [segment[i:i + kmer] for i in range(offset, len(segment) - kmer + 1, shift)]

        return kmers

    def tokenize(self, text: str, **kwargs) -> List[str]:
        """
        Tokenizes the input text using LCA tokenization.

        Args:
            text (str): The input DNA sequence to tokenize.
            kwargs: Additional arguments, including:
                - offset (int): The starting position for tokenization. Default is 0.

        Returns:
            List[str]: A list of tokens generated from the input text.
        """
        return self._tokenize(text, **kwargs)

    def encode(self, text: str, **kwargs) -> List[int]:
        """
        Extends the base `encode` method to support an `offset` parameter for custom tokenization logic.

        Args:
            text (str): Input text (DNA sequence).
            offset (int): Offset parameter for the LCA tokenization. Defaults to 0.
            kwargs: Additional arguments passed to the base `encode` method.

        Returns:
            List[int]: Encoded token IDs.
        """
        # Inject the offset into kwargs for the tokenizer
        offset = kwargs.get("offset", 0)
        kwargs["offset"] = offset
        return super().encode(text, **kwargs)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Builds inputs by adding special tokens to a sequence or pair of sequences.

        Args:
            token_ids_0 (List[int]): List of token IDs for the first sequence.
            token_ids_1 (List[int], optional): List of token IDs for the second sequence.

        Returns:
            List[int]: Input IDs with special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        input_ids = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]
        # token_type_ids = [0 for i in range(len(input_ids))]
        return input_ids

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)

        Should be overridden in a subclass if the model has a special way of building those.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

        Returns:
            `List[int]`: The token type ids.
        """
        if token_ids_1 is None:
            return (len(token_ids_0) + 2) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    def batch_encode_plus(self, *args, **kwargs):
        """
        Extends the base `batch_encode_plus` method to add custom functionality if needed.

        Args:
            *args: Positional arguments passed to the base method.
            **kwargs: Keyword arguments passed to the base method.

        Returns:
            dict: A dictionary containing the results of batch encoding.
        """
        # Call the parent method to handle the batch encoding
        # print('Running batch encoding with ids')
        act_outputs = super().batch_encode_plus(*args, **kwargs)
        return act_outputs

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Saves the tokenizer's vocabulary to a file.

        Args:
            save_directory (str): Directory to save the vocabulary file.
            filename_prefix (str, optional): Prefix for the filename. Default is None.

        Returns:
            Tuple[str]: Path to the saved vocabulary file.
        """
        if filename_prefix is None:
            filename_prefix = ""
        vocab_file_path = os.path.join(save_directory, filename_prefix + "vocab.txt")
        with open(vocab_file_path, "w") as f:
            for token in self.vocab:
                f.write(token + "\n")
        return (vocab_file_path,)

    @property
    def vocab_size(self) -> int:
        """
        Returns the size of the vocabulary (number of tokens in `vocab.txt`).

        Returns:
            int: The size of the vocabulary.
        """
        return len(self.vocab)

    def save_pretrained(self, save_directory: str, **kwargs):
        """
        Save the tokenizer configuration and vocabulary to a directory.

        Args:
            save_directory (str): Directory to save the tokenizer files.
            kwargs: Additional arguments for saving.
        """
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)

        # Save the base tokenizer configuration
        super().save_pretrained(save_directory, **kwargs)

        # Path to the tokenizer configuration file
        tokenizer_config_path = os.path.join(save_directory, "tokenizer_config.json")

        # Load the existing configuration or create a new one
        if os.path.exists(tokenizer_config_path):
            with open(tokenizer_config_path, "r", encoding="utf-8") as f:
                tokenizer_config = json.load(f)
        else:
            tokenizer_config = {}

        # Add custom fields for AutoTokenizer and remote code
        # tokenizer_config["auto_map"] = {
        #     "AutoTokenizer": "src.prokbert.tokenizer.LCATokenizer"
        # }
        # tokenizer_config["repository"] = "https://github.com/nbrg-ppcu/prokbert"
        # tokenizer_config["trust_remote_code"] = True
        tokenizer_config["kmer"] = self.kmer
        tokenizer_config["shift"] = self.shift
        tokenizer_config["operation_space"] = self.operation_space
        # Save the updated configuration
        with open(tokenizer_config_path, "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, indent=2)
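
For reviewers, a minimal usage sketch (not part of this commit): with the files from this commit in a local checkout or Hub repo, the auto_map entry added to tokenizer_config.json lets AutoTokenizer resolve tokenizer.LCATokenizer when trust_remote_code=True. The path below is a placeholder, and the expected tokens follow from the defaults kmer=6, shift=1 used by lca_kmer_tokenize_segment.

from transformers import AutoTokenizer

# "path/to/this/repo" is a placeholder for the Hub repo id or a local directory
# containing tokenizer.py, vocab.txt and tokenizer_config.json from this commit.
tok = AutoTokenizer.from_pretrained("path/to/this/repo", trust_remote_code=True)

seq = "ATGCATGCATGC"
# With kmer=6 and shift=1, a 6-mer window slides one base at a time.
print(tok.tokenize(seq))   # ['ATGCAT', 'TGCATG', 'GCATGC', 'CATGCA', ...]
# encode() wraps the k-mer ids in [CLS] ... [SEP] via build_inputs_with_special_tokens.
print(tok.encode(seq))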
tokenizer_config.json CHANGED
@@ -1,6 +1,55 @@
  {
- "clean_up_tokenization_spaces": true,
+ "added_tokens_decoder": {
+   "0": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "1": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "2": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "3": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "4": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   }
+ },
+ "auto_map": {
+   "AutoTokenizer": [
+     "tokenizer.LCATokenizer",
+     null
+   ]
+ },
+ "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
+ "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
@@ -8,5 +57,6 @@
  "tokenizer_class": "LCATokenizer",
  "unk_token": "[UNK]",
  "kmer": 6,
- "shift": 1
- }
+ "shift": 1,
+ "operation_space": "kmer"
+ }
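
A small round-trip sketch (an assumption for illustration, not part of the commit) of why kmer, shift, and operation_space appear in this config: the overridden save_pretrained in tokenizer.py writes them back into tokenizer_config.json after the base class has saved its files. It assumes tokenizer.py and vocab.txt from this commit sit in the working directory.

from tokenizer import LCATokenizer

tok = LCATokenizer(vocab_file="vocab.txt", kmer=6, shift=1, operation_space="kmer")
tok.save_pretrained("./lca-tok")   # writes vocab.txt plus tokenizer_config.json with kmer/shift/operation_space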
vocab.txt CHANGED
@@ -4099,4 +4099,3 @@ TTTTTA
  TTTTTC
  TTTTTG
  TTTTTT
- NNNNNN
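
For context on the vocab.txt change, a hypothetical reconstruction of the vocabulary layout inferred from the files above (not the script used in this commit): ids 0-4 are the five special tokens declared in added_tokens_decoder, followed by all 4^6 = 4096 DNA 6-mers, giving the 4101 entries seen here; the trailing NNNNNN row removed in this commit is instead injected at runtime (mapped to id 1) when operation_space == "sequence".

from itertools import product

specials = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]   # ids 0-4, per added_tokens_decoder
kmers = ["".join(p) for p in product("ACGT", repeat=6)]     # 4**6 = 4096 six-mers, lexicographic order
vocab = specials + kmers

assert len(vocab) == 4101 and vocab[-1] == "TTTTTT"         # matches the final line of vocab.txt
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(vocab) + "\n")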