fasttext-jp-embedding / fasttext_jp_tokenizer.py
Taizo Kaneko
commit files to HF hub
76b4794
raw
history blame
3.14 kB
from __future__ import annotations
from .mecab_tokenizer import MeCabTokenizer
import os
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
def save_stoi(stoi: dict[str, int], vocab_file: str):
with open(vocab_file, "w", encoding="utf-8") as writer:
index = 0
for token, token_index in sorted(stoi.items(), key=lambda kv: kv[1]):
if index != token_index:
raise ValueError(
"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!")
writer.write(token + "\n")
index += 1
def load_stoi(vocab_file: str) -> dict[str, int]:
stoi: dict[str, int] = {}
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
stoi[token] = index
return stoi
class FastTextJpTokenizer(MeCabTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
def __init__(self,
vocab_file: str,
hinshi: list[str] | None = None,
mecab_dicdir: str | None = None,
**kwargs):
"""初期化処理
Args:
vocab_file (str): vocab_fileのpath
hinshi (list[str] | None, optional): 抽出する品詞
mecab_dicdir (str | None, optional): dicrcのあるディレクトリ
"""
super().__init__(hinshi, mecab_dicdir, **kwargs)
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.stoi = load_stoi(vocab_file)
self.itos = dict([(ids, tok) for tok, ids in self.stoi.items()])
self.v_size = len(self.stoi)
# self._auto_map = {
# "AutoTokenizer": ["modeling.FastTextMeCabTokenizer", None]
# }
# self.init_inputs = ["vocab.txt"]
@property
def vocab_size(self) -> int:
"""
`int`: Size of the base vocabulary (without the added tokens).
"""
return self.v_size
def _convert_token_to_id(self, token: str) -> int:
return self.stoi[token]
def _convert_id_to_token(self, index: int) -> str:
return self.itos[index]
def save_vocabulary(self,
save_directory: str,
filename_prefix: str | None = None) -> tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
"vocab.txt")
else:
vocab_file = (filename_prefix +
"-" if filename_prefix else "") + save_directory
save_stoi(self.stoi, vocab_file)
return (vocab_file, )
FastTextJpTokenizer.register_for_auto_class("AutoTokenizer")