KoichiYasuoka committed
Commit c981f09 · 1 parent: ae8d65b

model improved
Files changed (6)
  1. config.json +1 -1
  2. maker.py +14 -8
  3. oldtokenizer.json +0 -0
  4. pytorch_model.bin +1 -1
  5. tokenizer.json +0 -0
  6. ud.py +8 -2
config.json CHANGED
@@ -367,7 +367,7 @@
   "tie_word_embeddings": false,
   "tokenizer_class": "PreTrainedTokenizerFast",
   "torch_dtype": "float32",
-  "transformers_version": "4.42.4",
+  "transformers_version": "4.44.2",
   "use_cache": true,
   "use_parallel_residual": false,
   "vocab_size": 44416
maker.py CHANGED
@@ -3,7 +3,7 @@ src="rinna/japanese-gpt-neox-small"
 tgt="KoichiYasuoka/rinna-gpt-neox-small-japanese-ud-causal"
 url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
 
-import os,json
+import os,json,unicodedata
 from transformers import AutoTokenizer,PreTrainedTokenizerFast,AutoConfig,GPTNeoXForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
 d=os.path.basename(url)
 os.system("test -d "+d+" || git clone --depth=1 "+url)
@@ -11,6 +11,7 @@ os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
 tkz=AutoTokenizer.from_pretrained(src,add_prefix_space=False,legacy=False,model_max_length=2048)
 tkz.save_pretrained("tmpdir")
 d=json.loads(tkz.backend_tokenizer.to_str())
+tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/oldtokenizer.json")
 form=set()
 with open("train.conllu","r",encoding="utf-8") as r:
   for s in r:
@@ -20,13 +21,17 @@ with open("train.conllu","r",encoding="utf-8") as r:
 for t in d["model"]["vocab"]:
   if t[0] not in form:
     t[1]*=len(t[0])
+  elif len(t[0])>1 and unicodedata.name(t[0][0]).startswith("HIRAGANA"):
+    t[1]*=len(t[0])
 tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/tokenizer.json")
-tkz=PreTrainedTokenizerFast.from_pretrained("tmpdir")
+ntk=PreTrainedTokenizerFast.from_pretrained("tmpdir")
+otk=PreTrainedTokenizerFast.from_pretrained("tmpdir",tokenizer_file="tmpdir/oldtokenizer.json")
 
 class UDCausalDataset(object):
-  def __init__(self,conllu,tokenizer,embeddings=None):
+  def __init__(self,conllu,tokenizer,oldtokenizer=None,embeddings=None):
     self.conllu=open(conllu,"r",encoding="utf-8")
     self.tokenizer=tokenizer
+    self.oldtokenizer=oldtokenizer if oldtokenizer else tokenizer
     self.embeddings=embeddings
     self.max_tokens=3
     self.seeks=[(0,0)]
@@ -71,8 +76,8 @@ class UDCausalDataset(object):
         if w[0].isdecimal():
           upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
           deps.append((int(w[6]),w[7]))
-    v=self.tokenizer(form,add_special_tokens=False)
     if t==0:
+      v=self.tokenizer(form,add_special_tokens=False)
       i,u=[],[]
       for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
         if x!=[]:
@@ -82,6 +87,7 @@
       pad=self.tokenizer.pad_token_id
     else:
       import torch
+      v=self.oldtokenizer(form,add_special_tokens=False)
       m=[]
       for x in v["input_ids"]:
         if x==[]:
@@ -109,9 +115,9 @@
         upos=u[0:self.max_tokens]
       return {"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
 
-trainDS=UDCausalDataset("train.conllu",tkz)
-devDS=UDCausalDataset("dev.conllu",tkz)
-testDS=UDCausalDataset("test.conllu",tkz)
+trainDS=UDCausalDataset("train.conllu",ntk,otk)
+devDS=UDCausalDataset("dev.conllu",ntk,otk)
+testDS=UDCausalDataset("test.conllu",ntk,otk)
 lid=trainDS(devDS,testDS)
 cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
 mdl=GPTNeoXForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True)
@@ -121,4 +127,4 @@ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,dataload
 trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
 trn.train()
 trn.save_model(tgt)
-tkz.save_pretrained(tgt)
+ntk.save_pretrained(tgt)
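
Note on the tokenizer rework above: model.vocab in a Unigram-style tokenizer.json holds [piece, score] pairs whose scores are log-probabilities, so multiplying the score of a multi-character piece by its length makes it far less attractive during segmentation. Pieces that never occur as a word form in train.conllu, and multi-character pieces starting with hiragana, are penalized this way in the saved tokenizer.json, while oldtokenizer.json keeps the original scores for the embedding path. A minimal comparison sketch, assuming maker.py has been run and tmpdir/ exists (the sample sentence is illustrative, not from the repository):

# Minimal sketch: compare segmentation of the rescored and original tokenizers
# (assumes maker.py above has been run so that tmpdir/ contains both files).
from transformers import PreTrainedTokenizerFast
ntk=PreTrainedTokenizerFast.from_pretrained("tmpdir")                                            # rescored tokenizer.json
otk=PreTrainedTokenizerFast.from_pretrained("tmpdir",tokenizer_file="tmpdir/oldtokenizer.json")  # original scores
s="これは国境の長いトンネルを抜けた例文です"  # illustrative sentence
print(otk.tokenize(s))  # segmentation under the original Unigram scores
print(ntk.tokenize(s))  # usually finer-grained, since penalized pieces tend to be avoided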
oldtokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bb8c94b610e3cb65a776ba947f8ad8edd6534aa050ebff4fbc5283b99e2980c7
+oid sha256:8fffdfbf75da7de6f3a3c543821983faf9a3efc331f88476a253d2f527f3094a
 size 477227998
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
ud.py CHANGED
@@ -1,5 +1,10 @@
 import numpy
-from transformers import TokenClassificationPipeline
+from transformers import TokenClassificationPipeline,AutoTokenizer
+try:
+  from transformers.utils import cached_file
+except:
+  from transformers.file_utils import cached_path,hf_bucket_url
+  cached_file=lambda x,y:os.path.join(x,y) if os.path.isdir(x) else cached_path(hf_bucket_url(x,y))
 
 class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
   def __init__(self,**kwargs):
@@ -42,6 +47,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
   def __init__(self,**kwargs):
     kwargs["aggregation_strategy"]="simple"
     super().__init__(**kwargs)
+    self.oldtokenizer=AutoTokenizer.from_pretrained(self.tokenizer.name_or_path,tokenizer_file=cached_file(self.tokenizer.name_or_path,"oldtokenizer.json"))
     x=self.model.config.label2id
     self.root=numpy.full((len(x)),numpy.nan)
     self.left_arc=numpy.full((len(x)),numpy.nan)
@@ -87,7 +93,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
       if d[i].strip()=="":
         d.pop(i)
         w.pop(i)
-    v=self.tokenizer(d,add_special_tokens=False)
+    v=self.oldtokenizer(d,add_special_tokens=False)
     e=self.model.get_input_embeddings().weight
     m=[]
     for x in v["input_ids"]:
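
Taken together, maker.py now labels with the rescored tokenizer while ud.py fetches oldtokenizer.json at pipeline construction time and uses it only to build the input embeddings. A rough usage sketch for the published model, assuming ud.py from this repository is importable locally; the repository may also expose the pipeline through other loading paths that this diff does not show, and the input sentence is illustrative:

# Rough usage sketch; assumes ud.py from this repository sits next to this script.
from transformers import AutoTokenizer,AutoModelForTokenClassification
from ud import UniversalDependenciesCausalPipeline
mdl="KoichiYasuoka/rinna-gpt-neox-small-japanese-ud-causal"
nlp=UniversalDependenciesCausalPipeline(model=AutoModelForTokenClassification.from_pretrained(mdl),
                                        tokenizer=AutoTokenizer.from_pretrained(mdl))
print(nlp("これはテストの文です"))  # illustrative input; the pipeline's __init__ pulls oldtokenizer.json from the hub cache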