austindavis committed
Commit f3eb8cb · verified · 1 parent: 0dd6796

Uploaded Tokenizer Config Files

Files changed (3):
  1. special_tokens_map.json +3 -0
  2. tokenizer.json +124 -0
  3. tokenizer_config.json +7 -0
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "bos_token": ";"
+ }
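
For orientation (not part of the committed files): this map simply declares the semicolon as the beginning-of-sequence token. A minimal sketch of reading it back, assuming the file sits in the current directory:

import json

# special_tokens_map.json declares ";" as the BOS token; the
# TemplateProcessing post-processor in tokenizer.json (below) is what
# actually prepends it to every encoded sequence.
with open("special_tokens_map.json") as f:
    print(json.load(f))  # {'bos_token': ';'}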
tokenizer.json ADDED
@@ -0,0 +1,124 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 9,
+       "content": ";",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "Split",
+     "pattern": {
+       "Regex": "."
+     },
+     "behavior": "Isolated",
+     "invert": false
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": ";",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "": {
+         "id": "",
+         "ids": [
+           0
+         ],
+         "tokens": [
+           ""
+         ]
+       },
+       ";": {
+         "id": ";",
+         "ids": [
+           9
+         ],
+         "tokens": [
+           ";"
+         ]
+       }
+     }
+   },
+   "decoder": {
+     "type": "Sequence",
+     "decoders": [
+       {
+         "type": "Replace",
+         "pattern": {
+           "String": " "
+         },
+         "content": "▁"
+       },
+       {
+         "type": "Replace",
+         "pattern": {
+           "String": "▁"
+         },
+         "content": " "
+       }
+     ]
+   },
+   "model": {
+     "type": "WordLevel",
+     "vocab": {
+       " ": 0,
+       "1": 1,
+       "2": 2,
+       "3": 3,
+       "4": 4,
+       "5": 5,
+       "6": 6,
+       "7": 7,
+       "8": 8,
+       ";": 9,
+       "#": 10,
+       "a": 11,
+       "b": 12,
+       "c": 13,
+       "d": 14,
+       "e": 15,
+       "f": 16,
+       "g": 17,
+       "h": 18,
+       "n": 19,
+       "r": 20,
+       "q": 21,
+       "k": 22
+     },
+     "unk_token": " "
+   }
+ }
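
A hedged usage sketch (not part of the commit): tokenizer.json defines a character-level WordLevel vocabulary over a UCI chess-move alphabet (letters a–h, digits 1–8, plus n, r, q, k, "#", a space separator, and ";" as BOS). Loading it with the `tokenizers` library and encoding a hypothetical move sequence:

from tokenizers import Tokenizer

# Load the serialized tokenizer uploaded in this commit.
tok = Tokenizer.from_file("tokenizer.json")

# The Split pre-tokenizer (Regex ".", behavior "Isolated") makes every
# character its own token; the TemplateProcessing post-processor then
# prepends the ";" BOS token.
enc = tok.encode("e2e4 e7e5")  # hypothetical opening moves
print(enc.tokens)  # [';', 'e', '2', 'e', '4', ' ', 'e', '7', 'e', '5']
print(enc.ids)     # [9, 15, 2, 15, 4, 0, 15, 7, 15, 5]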
tokenizer_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "add_bos_token": true,
+   "bos_token": ";",
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "tokenizer_class": "UciTokenizer"
+ }
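
Two notes on this config (observations, not part of the commit): the huge model_max_length is transformers' int(1e30) sentinel meaning "no practical length limit", and tokenizer_class names a custom UciTokenizer, which transformers can only resolve if the class definition is available (e.g., shipped as code in the repository). A hedged loading sketch; the repo id below is a placeholder, and trust_remote_code is an assumption:

from transformers import AutoTokenizer

# "austindavis/uci-model" is a placeholder repo id, not from the commit.
# trust_remote_code=True is assumed necessary because "UciTokenizer" is
# not a built-in transformers class.
tokenizer = AutoTokenizer.from_pretrained(
    "austindavis/uci-model",
    trust_remote_code=True,
)
print(tokenizer.bos_token)  # ';' (prepended on encode since add_bos_token is true)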