Taizo Kaneko committed on
Commit
76b4794
·
1 Parent(s): 0bb74b8

commit files to HF hub

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ vocab.txt filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "FastTextJpModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "fasttext_jp_embedding.FastTextJpConfig",
7
+ "AutoModel": "fasttext_jp_embedding.FastTextJpModel"
8
+ },
9
+ "hidden_size": 300,
10
+ "model_type": "fast_text_jp",
11
+ "torch_dtype": "float32",
12
+ "transformers_version": "4.23.1",
13
+ "vocab_size": 10000
14
+ }
fasttext_jp_embedding.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from transformers import PretrainedConfig
3
+ from transformers import PreTrainedModel
4
+ from torch import nn
5
+ import torch
6
+
7
+
8
class FastTextJpConfig(PretrainedConfig):
    """Configuration class for the ``fast_text_jp`` model type.

    The class declares no fields of its own: every keyword argument is handed
    to ``PretrainedConfig`` unchanged.
    """

    model_type = "fast_text_jp"

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the base configuration."""
        super().__init__(**kwargs)
13
+
14
+
15
class FastTextJpModel(PreTrainedModel):
    """Embed token ids with a single fastText word-embedding table.

    The only layer is an ``nn.Embedding`` with ``config.vocab_size`` rows and
    ``config.hidden_size`` columns.
    """

    config_class = FastTextJpConfig

    def __init__(self, config: FastTextJpConfig):
        """Build the embedding table from ``config``.

        Args:
            config: Configuration providing ``vocab_size`` and ``hidden_size``.
        """
        super().__init__(config)
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size)

    def forward(self, input_ids, **kwargs):
        """Look up the embedding vector of every id in ``input_ids``.

        Args:
            input_ids: Integer tensor of token ids to embed.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The result of the embedding lookup for ``input_ids``.
        """
        # Use the caller's ids. A hard-coded ``torch.tensor([0])`` would
        # return row 0 for every input and would also live on the CPU
        # regardless of where the embedding weights are.
        return self.word_embeddings(input_ids)


# Register the config and the model with the transformers auto classes
# (the model under "AutoModel").
FastTextJpConfig.register_for_auto_class()
FastTextJpModel.register_for_auto_class("AutoModel")
fasttext_jp_tokenizer.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from .mecab_tokenizer import MeCabTokenizer
3
+ import os
4
+
5
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
6
+
7
+
8
def save_stoi(stoi: dict[str, int], vocab_file: str):
    """Write a token-to-index mapping to ``vocab_file``, one token per line.

    Tokens are written in ascending index order, so the 0-based line number of
    a token equals its index.

    Args:
        stoi: Mapping from token to index. The indices must be exactly
            ``0 .. len(stoi) - 1``.
        vocab_file: Path of the UTF-8 text file to write.

    Raises:
        ValueError: If the indices are not consecutive starting at 0.
    """
    ordered = sorted(stoi.items(), key=lambda kv: kv[1])

    # Validate before opening the file so that a corrupted mapping does not
    # truncate or half-write an existing vocabulary file.
    for expected_index, (_, token_index) in enumerate(ordered):
        if expected_index != token_index:
            raise ValueError(
                f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                " Please check that the vocabulary is not corrupted!")

    with open(vocab_file, "w", encoding="utf-8") as writer:
        writer.writelines(token + "\n" for token, _ in ordered)
18
+
19
+
20
def load_stoi(vocab_file: str) -> dict[str, int]:
    """Read a vocabulary file into a token-to-index mapping.

    Each line holds one token; its index is the 0-based line number. Only
    trailing newline characters are stripped, so other whitespace is kept as
    part of the token. If a token appears more than once, the last occurrence
    determines its index.

    Args:
        vocab_file: Path of the UTF-8 vocabulary file.

    Returns:
        Mapping from token to index.
    """
    with open(vocab_file, "r", encoding="utf-8") as reader:
        return {
            line.rstrip("\n"): position
            for position, line in enumerate(reader)
        }
28
+
29
+
30
class FastTextJpTokenizer(MeCabTokenizer):
    """MeCab-based tokenizer with a fixed vocabulary read from a text file.

    Token ids are the 0-based line numbers of the tokens in the vocabulary
    file (see ``load_stoi``).
    """

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(self,
                 vocab_file: str,
                 hinshi: list[str] | None = None,
                 mecab_dicdir: str | None = None,
                 **kwargs):
        """Load the vocabulary and initialize the MeCab tokenizer.

        Args:
            vocab_file (str): Path to the vocabulary file.
            hinshi (list[str] | None, optional): Parts of speech to extract.
            mecab_dicdir (str | None, optional): Directory that contains dicrc.

        Raises:
            ValueError: If ``vocab_file`` is not an existing file.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary of a pretrained"
                " model use `tokenizer = FastTextJpTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )

        # Populate the lookup tables before running the base initializer so
        # that vocabulary queries already work if the base class issues any
        # while it is being set up.
        self.stoi = load_stoi(vocab_file)
        self.itos = {index: token for token, index in self.stoi.items()}
        self.v_size = len(self.stoi)

        super().__init__(hinshi, mecab_dicdir, **kwargs)

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self.v_size

    def get_vocab(self) -> dict[str, int]:
        """Return a copy of the token-to-id mapping, including added tokens."""
        return dict(self.stoi, **self.added_tokens_encoder)

    def _convert_token_to_id(self, token: str) -> int:
        """Return the id of ``token``; raises ``KeyError`` if it is unknown."""
        return self.stoi[token]

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token stored at ``index``; raises ``KeyError`` if absent."""
        return self.itos[index]

    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: str | None = None) -> tuple[str]:
        """Write the vocabulary to disk and return the written path.

        Args:
            save_directory: Existing directory to write into. If it is not a
                directory, the value itself is used as the target path.
            filename_prefix: Optional prefix; when given, ``"<prefix>-"`` is
                prepended to the file name (or to the target path).

        Returns:
            One-element tuple holding the path of the written file.

        Raises:
            ValueError: If the vocabulary indices are not consecutive.
        """
        prefix = filename_prefix + "-" if filename_prefix else ""
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
        else:
            vocab_file = prefix + save_directory
        save_stoi(self.stoi, vocab_file)
        return (vocab_file, )


# Register the tokenizer with the transformers auto classes under
# "AutoTokenizer".
FastTextJpTokenizer.register_for_auto_class("AutoTokenizer")
mecab_tokenizer.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from typing import NamedTuple
3
+ import MeCab
4
+ from transformers import PreTrainedTokenizer
5
+
6
+
7
class MeCabResult(NamedTuple):
    """One morpheme produced by MeCab.

    The first field is the node's surface string; the remaining nine fields
    are feature columns in the order they are passed to the constructor.
    Field names are romanized Japanese; an English gloss is given per field.
    """

    hyosokei: str  # surface form
    hinshi: str  # part of speech
    hinshi_saibunrui_1: str  # part-of-speech subcategory 1
    hinshi_saibunrui_2: str  # part-of-speech subcategory 2
    hinshi_saibunrui_3: str  # part-of-speech subcategory 3
    katsuyokei_1: str  # conjugation column 1
    katsuyokei_2: str  # conjugation column 2
    genkei: str  # base (dictionary) form
    yomi: str  # reading
    hatsuon: str  # pronunciation
18
+
19
+
20
class MeCabTokenizer(PreTrainedTokenizer):
    """Tokenizer that splits text into morphemes with MeCab."""

    def __init__(self,
                 hinshi: list[str] | None = None,
                 mecab_dicdir: str | None = None,
                 **kwargs):
        """Create the MeCab tagger and initialize the base tokenizer.

        Args:
            hinshi (list[str] | None): Parts of speech to extract. ``None``
                keeps every morpheme.
            mecab_dicdir (str | None, optional): Directory that contains dicrc.
        """

        self.target_hinshi = hinshi
        if mecab_dicdir is not None:
            self.mecab = MeCab.Tagger(f"-d {mecab_dicdir}")
        else:
            self.mecab = MeCab.Tagger()

        super().__init__(**kwargs)

    def _tokenize(self, text: str) -> list[str]:
        """Return the surface forms of the morphemes in ``text``.

        When ``target_hinshi`` is set, only morphemes whose part of speech is
        listed there are returned; otherwise every morpheme is returned.

        Args:
            text (str): Text to tokenize.

        Returns:
            list[str]: Surface forms of the selected morphemes, in order.
        """
        wanted = self.target_hinshi
        surfaces = []
        for morpheme in self.mecab_analyze(text):
            # The first and last nodes have an empty surface string.
            if morpheme.hyosokei == "":
                continue
            # Apply the part-of-speech filter only when one was configured;
            # previously both branches appended, so the filter had no effect.
            if wanted is not None and morpheme.hinshi not in wanted:
                continue
            surfaces.append(morpheme.hyosokei)
        return surfaces

    def mecab_analyze(self, text: str) -> list[MeCabResult]:
        """Analyze ``text`` with MeCab.

        Args:
            text (str): Text to analyze.

        Returns:
            list[MeCabResult]: One entry per node MeCab returns, including
            the nodes with an empty surface.
        """
        field_count = len(MeCabResult._fields)
        results = []
        node = self.mecab.parseToNode(text)
        # Walk the linked list of nodes, one morpheme at a time.
        while node:
            fields = [node.surface, *node.feature.split(",")]
            # An entry may carry fewer feature columns than MeCabResult has
            # fields; pad with "*" instead of raising IndexError. Extra
            # columns beyond the known fields are dropped, as before.
            fields.extend(["*"] * (field_count - len(fields)))
            results.append(MeCabResult(*fields[:field_count]))
            node = node.next
        return results
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16c44d91478fe733c856779a82ff9a9da10fd8da41f594b4088b0c3d3a783003
3
+ size 12000829
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "fasttext_jp_tokenizer.FastTextJpTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "tokenizer_class": "FastTextJpTokenizer"
9
+ }
vocab.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a1770ed0a47f44e882afc3f56271a16bc8dba675f18dd61e2cffac276b49acc
3
+ size 29910902