from __future__ import annotations

import os

from .mecab_tokenizer import MeCabTokenizer

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def save_stoi(stoi: dict[str, int], vocab_file: str):
    """Write the token-to-index mapping to vocab_file, one token per line, in index order."""
    with open(vocab_file, "w", encoding="utf-8") as writer:
        index = 0
        for token, token_index in sorted(stoi.items(), key=lambda kv: kv[1]):
            if index != token_index:
                raise ValueError(
                    f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                    " Please check that the vocabulary is not corrupted!")
            writer.write(token + "\n")
            index += 1


def load_stoi(vocab_file: str) -> dict[str, int]:
    """Read a vocabulary file (one token per line) into a token-to-index mapping."""
    stoi: dict[str, int] = {}
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        stoi[token] = index
    return stoi
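
# Round-trip sketch (illustrative only, not part of the original module):
# save_stoi writes one token per line in index order, and load_stoi rebuilds
# the same token-to-index mapping from that file.
#
#   save_stoi({"日本": 0, "語": 1}, "vocab.txt")
#   assert load_stoi("vocab.txt") == {"日本": 0, "語": 1}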


class FastTextJpTokenizer(MeCabTokenizer):
    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(self,
                 vocab_file: str,
                 hinshi: list[str] | None = None,
                 mecab_dicdir: str | None = None,
                 **kwargs):
        """初期化処理

        Args:
            vocab_file (str): vocab_fileのpath
            hinshi (list[str] | None, optional): 抽出する品詞
            mecab_dicdir (str | None, optional): dicrcのあるディレクトリ
        """
        super().__init__(hinshi, mecab_dicdir, **kwargs)

        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a pretrained"
                " model, use `tokenizer = FastTextJpTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`."
            )
        self.stoi = load_stoi(vocab_file)
        self.itos = {ids: tok for tok, ids in self.stoi.items()}
        self.v_size = len(self.stoi)

        # self._auto_map = {
        #     "AutoTokenizer": ["modeling.FastTextMeCabTokenizer", None]
        # }
        # self.init_inputs = ["vocab.txt"]

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self.v_size

    def _convert_token_to_id(self, token: str) -> int:
        return self.stoi[token]

    def _convert_id_to_token(self, index: int) -> str:
        return self.itos[index]

    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: str | None = None) -> tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") +
                "vocab.txt")
        else:
            vocab_file = ((filename_prefix + "-" if filename_prefix else "") +
                          save_directory)
        save_stoi(self.stoi, vocab_file)
        return (vocab_file, )


FastTextJpTokenizer.register_for_auto_class("AutoTokenizer")
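
# Usage sketch (assumption, not part of the original module): once this file
# and a matching vocab.txt are uploaded to a Hugging Face Hub repo, the
# register_for_auto_class call above lets the tokenizer be loaded through
# AutoTokenizer with custom code enabled. The repo id "your-org/fasttext-jp"
# is a placeholder.
#
#   from transformers import AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("your-org/fasttext-jp",
#                                             trust_remote_code=True)
#   tokens = tokenizer.tokenize("日本語のテキストを分かち書きする")
#   ids = tokenizer.convert_tokens_to_ids(tokens)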