bbb / tokenizer.json
Lakoc's picture
Model save
ee2dd41
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Replace",
"pattern": {
"String": "``"
},
"content": "\""
},
{
"type": "Replace",
"pattern": {
"String": "''"
},
"content": "\""
},
{
"type": "Lowercase"
}
]
},
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<s>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "</s>",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "<s>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "</s>",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<s>",
"type_id": 1
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "</s>",
"type_id": 1
}
}
],
"special_tokens": {
"</s>": {
"id": "</s>",
"ids": [
1
],
"tokens": [
"</s>"
]
},
"<s>": {
"id": "<s>",
"ids": [
0
],
"tokens": [
"<s>"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<s>": 0,
"</s>": 1,
"<unk>": 2,
"<pad>": 3,
"<mask>": 4,
"#": 5,
"$": 6,
"%": 7,
"&": 8,
"'": 9,
"*": 10,
"+": 11,
"0": 12,
"1": 13,
"2": 14,
"3": 15,
"4": 16,
"5": 17,
"6": 18,
"7": 19,
"8": 20,
"9": 21,
"<": 22,
"=": 23,
">": 24,
"@": 25,
"[": 26,
"\\": 27,
"]": 28,
"^": 29,
"a": 30,
"b": 31,
"c": 32,
"d": 33,
"e": 34,
"f": 35,
"g": 36,
"h": 37,
"i": 38,
"j": 39,
"k": 40,
"l": 41,
"m": 42,
"n": 43,
"o": 44,
"p": 45,
"q": 46,
"r": 47,
"s": 48,
"t": 49,
"u": 50,
"v": 51,
"w": 52,
"x": 53,
"y": 54,
"z": 55,
"Ċ": 56,
"Ġ": 57,
"Ġt": 58,
"Ġa": 59,
"he": 60,
"in": 61,
"re": 62,
"Ġs": 63,
"Ġthe": 64,
"Ġo": 65,
"Ġw": 66,
"er": 67,
"nd": 68,
"Ġc": 69,
"at": 70,
"Ġb": 71,
"it": 72,
"ou": 73,
"Ġf": 74,
"on": 75,
"Ġm": 76,
"en": 77,
"Ġp": 78,
"is": 79,
"ing": 80,
"Ġto": 81,
"es": 82,
"Ġth": 83,
"Ġh": 84,
"or": 85,
"an": 86,
"Ġd": 87,
"ed": 88,
"Ġof": 89,
"Ġand": 90,
"Ġin": 91,
"al": 92,
"ar": 93,
"Ġl": 94,
"as": 95,
"Ġn": 96,
"om": 97,
"ic": 98,
"Ġg": 99,
"Ġe": 100,
"le": 101,
"ve": 102,
"Ġbe": 103,
"ll": 104,
"ion": 105,
"ly": 106,
"ent": 107,
"ot": 108,
"ut": 109,
"us": 110,
"Ġre": 111,
"Ġi": 112,
"Ġon": 113,
"Ġthat": 114,
"Ġis": 115,
"Ġy": 116,
"ow": 117,
"et": 118,
"id": 119,
"Ġit": 120,
"ac": 121,
"ay": 122,
"Ġha": 123,
"ld": 124,
"the": 125,
"Ġhe": 126,
"Ġwe": 127,
"ro": 128,
"ver": 129,
"Ġst": 130,
"ig": 131,
"Ġyou": 132,
"st": 133,
"Ġfor": 134,
"Ġwh": 135,
"ir": 136,
"se": 137,
"ur": 138,
"im": 139,
"ct": 140,
"am": 141,
"ith": 142,
"and": 143,
"ke": 144,
"ad": 145,
"un": 146,
"Ġan": 147,
"'s": 148,
"Ġare": 149,
"Ġwith": 150,
"Ġas": 151,
"Ġr": 152,
"Ġwas": 153,
"ch": 154,
"Ġse": 155,
"th": 156,
"oo": 157,
"ce": 158,
"if": 159,
"Ġmo": 160,
"ht": 161,
"op": 162,
"ill": 163,
"Ġhave": 164,
"ation": 165,
"ers": 166,
"Ġne": 167,
"il": 168,
"ol": 169,
"Ġnot": 170,
"Ġde": 171,
"Ġcom": 172,
"Ġfr": 173,
"ter": 174,
"red": 175,
"ul": 176,
"Ġdo": 177,
"Ġch": 178,
"ge": 179,
"ive": 180,
"ould": 181,
"ight": 182,
"Ġone": 183,
"Ġj": 184,
"Ġat": 185,
"Ġor": 186,
"Ġcon": 187,
"Ġthey": 188,
"Ġsa": 189,
"Ġk": 190,
"ant": 191,
"ess": 192,
"Ġpro": 193,
"ous": 194,
"pe": 195,
"Ġcan": 196,
"Ġex": 197,
"Ġab": 198,
"Ġthis": 199,
"ain": 200,
"Ġu": 201,
"Ġsh": 202,
"Ġwor": 203,
"est": 204,
"ate": 205,
"Ġtr": 206,
"Ġv": 207,
"out": 208,
"ist": 209,
"ab": 210,
"ust": 211,
"ome": 212,
"Ġsu": 213,
"Ġme": 214,
"Ġtw": 215,
"ally": 216,
"Ġso": 217,
"Ġal": 218,
"ies": 219,
"pp": 220,
"her": 221,
"Ġgo": 222,
"Ġbut": 223,
"Ġli": 224,
"Ġwhe": 225,
"Ġfrom": 226,
"ity": 227,
"Ġsaid": 228,
"qu": 229,
"um": 230,
"our": 231,
"art": 232,
"'t": 233,
"Ġus": 234,
"res": 235,
"very": 236,
"ie": 237,
"all": 238,
"Ġall": 239,
"one": 240,
"Ġpl": 241,
"ine": 242,
"os": 243,
"ear": 244,
"ort": 245,
"Ġpe": 246,
"we": 247,
"oug": 248,
"nt": 249,
"ment": 250,
"Ġabout": 251,
"ap": 252,
"Ġby": 253,
"Ġout": 254,
"king": 255,
"ople": 256,
"Ġmore": 257,
"Ġtheir": 258,
"iv": 259,
"Ġwill": 260,
"el": 261,
"ast": 262,
"Ġwho": 263,
"Ġma": 264,
"Ġte": 265,
"Ġhas": 266,
"Ġint": 267,
"Ġpeople": 268,
"ard": 269,
"Ġup": 270,
"Ġmy": 271,
"Ġle": 272,
"Ġwhat": 273,
"Ġthere": 274,
"ack": 275,
"ound": 276,
"ind": 277,
"fe": 278,
"und": 279,
"ty": 280,
"Ġtim": 281,
"Ġun": 282,
"ther": 283,
"mp": 284,
"Ġar": 285,
"Ġthem": 286,
"Ġen": 287,
"Ġlike": 288,
"Ġtwo": 289,
"but": 290,
"ci": 291,
"ven": 292,
"ure": 293,
"ood": 294,
"now": 295,
"ays": 296,
"ong": 297,
"ag": 298,
"Ġyear": 299,
"ich": 300,
"Ġhad": 301,
"so": 302,
"Ġwould": 303,
"Ġget": 304,
"ud": 305,
"Ġwere": 306,
"ough": 307,
"Ġfe": 308,
"Ġyour": 309,
"act": 310,
"Ġhis": 311,
"Ġgr": 312,
"ousand": 313,
"Ġthousand": 314,
"Ġpr": 315,
"Ġsome": 316,
"Ġbec": 317,
"are": 318,
"Ġman": 319,
"undred": 320,
"Ġour": 321,
"Ġhundred": 322,
"ame": 323,
"Ġjust": 324,
"em": 325,
"Ġcl": 326,
"ide": 327,
"ree": 328,
"Ġcomp": 329,
"Ġwhen": 330,
"ks": 331,
"Ġshe": 332,
"Ġco": 333,
"use": 334,
"Ġtime": 335,
"Ġbr": 336,
"ans": 337,
"age": 338,
"Ġthan": 339,
"ib": 340,
"Ġsp": 341,
"ace": 342,
"Ġper": 343,
"Ġno": 344,
"Ġif": 345,
"Ġwhich": 346,
"reat": 347,
"Ġother": 348,
"ions": 349,
"Ġad": 350,
"ore": 351,
"Ġbeen": 352,
"Ġlo": 353,
"Ġta": 354,
"Ġknow": 355,
"Ġam": 356,
"Ġqu": 357,
"ice": 358,
"hing": 359,
"Ġnew": 360,
"ip": 361,
"Ġpo": 362,
"ause": 363,
"ink": 364,
"per": 365,
"ry": 366,
"Ġhow": 367,
"for": 368,
"ild": 369,
"Ġher": 370,
"te": 371,
"able": 372,
"Ġany": 373,
"Ġdis": 374,
"Ġag": 375,
"og": 376,
"ose": 377,
"ings": 378,
"Ġwant": 379,
"iz": 380,
"Ġinto": 381,
"Ġro": 382,
"ite": 383,
"ult": 384,
"Ġaf": 385,
"Ġac": 386,
"Ġpart": 387,
"ical": 388,
"od": 389,
"ence": 390,
"Ġres": 391,
"Ġevery": 392,
"Ġcould": 393,
"Ġover": 394,
"ook": 395,
"Ġvery": 396,
"ount": 397,
"Ġimp": 398,
"ix": 399,
"Ġcont": 400,
"ren": 401,
"ass": 402,
"Ġdr": 403,
"pl": 404,
"Ġbecause": 405,
"Ġwork": 406,
"own": 407,
"ated": 408,
"ck": 409,
"ue": 410,
"Ġoff": 411,
"Ġthink": 412,
"Ġspe": 413,
"Ġsc": 414,
"ance": 415,
"Ġneed": 416,
"Ġyears": 417,
"ach": 418,
"Ġway": 419,
"int": 420,
"pt": 421,
"Ġnow": 422,
"vel": 423,
"Ġgoing": 424,
"nder": 425,
"Ġbl": 426,
"Ġsee": 427,
"'re": 428,
"ect": 429,
"ress": 430,
"Ġonly": 431,
"enty": 432,
"Ġworld": 433,
"Ġmake": 434,
"ang": 435,
"Ġback": 436,
"ep": 437,
"Ġdon": 438,
"uch": 439,
"ations": 440,
"Ġthree": 441,
"ire": 442,
"ia": 443,
"Ġdid": 444,
"Ġalso": 445,
"sel": 446,
"Ġcr": 447,
"Ġdif": 448,
"itt": 449,
"Ġstr": 450,
"omet": 451,
"Ġthese": 452,
"ile": 453,
"Ġbet": 454,
"Ġmost": 455,
"Ġact": 456,
"Ġfir": 457,
"ens": 458,
"other": 459,
"wh": 460,
"ign": 461,
"ents": 462,
"ves": 463,
"they": 464,
"Ġpre": 465,
"ater": 466,
"ving": 467,
"Ġwhere": 468,
"Ġfl": 469,
"Ġthr": 470,
"ife": 471,
"Ġfive": 472,
"ub": 473,
"Ġinc": 474,
"Ġmuch": 475,
"ick": 476,
"Ġgood": 477,
"ors": 478,
"ial": 479,
"Ġnine": 480,
"ber": 481,
"Ġeven": 482,
"cess": 483,
"this": 484,
"du": 485,
"Ġmar": 486,
"cent": 487,
"ug": 488,
"Ġhim": 489,
"Ġfirst": 490,
"wn": 491,
"ory": 492,
"Ġthen": 493,
"Ġsix": 494,
"Ġcomm": 495,
"ade": 496,
"Ġits": 497,
"ak": 498,
"Ġsay": 499
},
"merges": [
"Ġ t",
"Ġ a",
"h e",
"i n",
"r e",
"Ġ s",
"Ġt he",
"Ġ o",
"Ġ w",
"e r",
"n d",
"Ġ c",
"a t",
"Ġ b",
"i t",
"o u",
"Ġ f",
"o n",
"Ġ m",
"e n",
"Ġ p",
"i s",
"in g",
"Ġt o",
"e s",
"Ġt h",
"Ġ h",
"o r",
"a n",
"Ġ d",
"e d",
"Ġo f",
"Ġa nd",
"Ġ in",
"a l",
"a r",
"Ġ l",
"a s",
"Ġ n",
"o m",
"i c",
"Ġ g",
"Ġ e",
"l e",
"v e",
"Ġb e",
"l l",
"i on",
"l y",
"en t",
"o t",
"u t",
"u s",
"Ġ re",
"Ġ i",
"Ġo n",
"Ġth at",
"Ġ is",
"Ġ y",
"o w",
"e t",
"i d",
"Ġ it",
"a c",
"a y",
"Ġh a",
"l d",
"t he",
"Ġ he",
"Ġw e",
"r o",
"v er",
"Ġs t",
"i g",
"Ġy ou",
"s t",
"Ġf or",
"Ġw h",
"i r",
"s e",
"u r",
"i m",
"c t",
"a m",
"it h",
"a nd",
"k e",
"a d",
"u n",
"Ġa n",
"' s",
"Ġa re",
"Ġw ith",
"Ġa s",
"Ġ r",
"Ġw as",
"c h",
"Ġs e",
"t h",
"o o",
"c e",
"i f",
"Ġm o",
"h t",
"o p",
"i ll",
"Ġha ve",
"at ion",
"er s",
"Ġn e",
"i l",
"o l",
"Ġn ot",
"Ġd e",
"Ġc om",
"Ġf r",
"t er",
"re d",
"u l",
"Ġd o",
"Ġc h",
"g e",
"i ve",
"ou ld",
"ig ht",
"Ġon e",
"Ġ j",
"Ġa t",
"Ġo r",
"Ġc on",
"Ġthe y",
"Ġs a",
"Ġ k",
"an t",
"es s",
"Ġp ro",
"ou s",
"p e",
"Ġc an",
"Ġe x",
"Ġa b",
"Ġth is",
"a in",
"Ġ u",
"Ġs h",
"Ġw or",
"es t",
"at e",
"Ġt r",
"Ġ v",
"ou t",
"is t",
"a b",
"us t",
"om e",
"Ġs u",
"Ġm e",
"Ġt w",
"al ly",
"Ġs o",
"Ġa l",
"i es",
"p p",
"he r",
"Ġg o",
"Ġb ut",
"Ġl i",
"Ġw he",
"Ġfr om",
"it y",
"Ġsa id",
"q u",
"u m",
"ou r",
"ar t",
"' t",
"Ġ us",
"re s",
"ver y",
"i e",
"al l",
"Ġa ll",
"on e",
"Ġp l",
"in e",
"o s",
"e ar",
"or t",
"Ġp e",
"w e",
"ou g",
"n t",
"m ent",
"Ġab out",
"a p",
"Ġb y",
"Ġo ut",
"k ing",
"op le",
"Ġmo re",
"Ġthe ir",
"i v",
"Ġw ill",
"e l",
"as t",
"Ġwh o",
"Ġm a",
"Ġt e",
"Ġh as",
"Ġin t",
"Ġpe ople",
"ar d",
"Ġu p",
"Ġm y",
"Ġl e",
"Ġwh at",
"Ġthe re",
"ac k",
"ou nd",
"in d",
"f e",
"u nd",
"t y",
"Ġt im",
"Ġ un",
"the r",
"m p",
"Ġa r",
"Ġthe m",
"Ġ en",
"Ġli ke",
"Ġtw o",
"b ut",
"c i",
"v en",
"u re",
"oo d",
"n ow",
"ay s",
"on g",
"a g",
"Ġy ear",
"ic h",
"Ġha d",
"s o",
"Ġw ould",
"Ġg et",
"u d",
"Ġwe re",
"oug h",
"Ġf e",
"Ġyou r",
"ac t",
"Ġh is",
"Ġg r",
"ous and",
"Ġth ousand",
"Ġp r",
"Ġs ome",
"Ġbe c",
"a re",
"Ġm an",
"und red",
"Ġo ur",
"Ġh undred",
"am e",
"Ġj ust",
"e m",
"Ġc l",
"id e",
"re e",
"Ġcom p",
"Ġwhe n",
"k s",
"Ġs he",
"Ġc o",
"us e",
"Ġtim e",
"Ġb r",
"an s",
"a ge",
"Ġth an",
"i b",
"Ġs p",
"ac e",
"Ġp er",
"Ġn o",
"Ġi f",
"Ġwh ich",
"re at",
"Ġo ther",
"ion s",
"Ġa d",
"o re",
"Ġbe en",
"Ġl o",
"Ġt a",
"Ġk now",
"Ġa m",
"Ġ qu",
"ic e",
"h ing",
"Ġne w",
"i p",
"Ġp o",
"a use",
"in k",
"p er",
"r y",
"Ġh ow",
"f or",
"i ld",
"Ġhe r",
"t e",
"ab le",
"Ġan y",
"Ġd is",
"Ġa g",
"o g",
"o se",
"ing s",
"Ġw ant",
"i z",
"Ġint o",
"Ġ ro",
"it e",
"ul t",
"Ġa f",
"Ġa c",
"Ġp art",
"ic al",
"o d",
"en ce",
"Ġre s",
"Ġe very",
"Ġc ould",
"Ġo ver",
"oo k",
"Ġ very",
"ou nt",
"Ġi mp",
"i x",
"Ġcon t",
"re n",
"as s",
"Ġd r",
"p l",
"Ġbec ause",
"Ġwor k",
"ow n",
"at ed",
"c k",
"u e",
"Ġof f",
"Ġth ink",
"Ġs pe",
"Ġs c",
"an ce",
"Ġne ed",
"Ġyear s",
"ac h",
"Ġw ay",
"in t",
"p t",
"Ġn ow",
"ve l",
"Ġgo ing",
"nd er",
"Ġb l",
"Ġse e",
"' re",
"e ct",
"res s",
"Ġon ly",
"ent y",
"Ġwor ld",
"Ġma ke",
"an g",
"Ġb ack",
"e p",
"Ġd on",
"u ch",
"ation s",
"Ġth ree",
"i re",
"i a",
"Ġd id",
"Ġal so",
"se l",
"Ġc r",
"Ġd if",
"it t",
"Ġst r",
"om et",
"Ġthe se",
"i le",
"Ġbe t",
"Ġmo st",
"Ġa ct",
"Ġf ir",
"en s",
"ot her",
"w h",
"ig n",
"ent s",
"v es",
"the y",
"Ġp re",
"at er",
"v ing",
"Ġwhe re",
"Ġf l",
"Ġth r",
"if e",
"Ġf ive",
"u b",
"Ġin c",
"Ġm uch",
"ic k",
"Ġg ood",
"or s",
"i al",
"Ġn ine",
"b er",
"Ġe ven",
"c ess",
"th is",
"d u",
"Ġm ar",
"c ent",
"u g",
"Ġh im",
"Ġfir st",
"w n",
"or y",
"Ġthe n",
"Ġs ix",
"Ġcom m",
"ad e",
"Ġit s",
"a k",
"Ġs ay"
]
}
}