achandlr committed
Commit 2e1aa5e (verified)
1 parent: 978c16f

Upload tokenizer

Files changed (2):
  1. tokenizer.json (+69, -5)
  2. tokenizer_config.json (+6, -2)
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
 {
   "version": "1.0",
-  "truncation": null,
+  "truncation": {
+    "direction": "Right",
+    "max_length": 4096,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
   "padding": null,
   "added_tokens": [
     {
@@ -2329,10 +2334,69 @@
     ]
   },
   "post_processor": {
-    "type": "ByteLevel",
-    "add_prefix_space": true,
-    "trim_offsets": false,
-    "use_regex": true
+    "type": "Sequence",
+    "processors": [
+      {
+        "type": "ByteLevel",
+        "add_prefix_space": true,
+        "trim_offsets": false,
+        "use_regex": true
+      },
+      {
+        "type": "TemplateProcessing",
+        "single": [
+          {
+            "SpecialToken": {
+              "id": "<|begin_of_text|>",
+              "type_id": 0
+            }
+          },
+          {
+            "Sequence": {
+              "id": "A",
+              "type_id": 0
+            }
+          }
+        ],
+        "pair": [
+          {
+            "SpecialToken": {
+              "id": "<|begin_of_text|>",
+              "type_id": 0
+            }
+          },
+          {
+            "Sequence": {
+              "id": "A",
+              "type_id": 0
+            }
+          },
+          {
+            "SpecialToken": {
+              "id": "<|begin_of_text|>",
+              "type_id": 1
+            }
+          },
+          {
+            "Sequence": {
+              "id": "B",
+              "type_id": 1
+            }
+          }
+        ],
+        "special_tokens": {
+          "<|begin_of_text|>": {
+            "id": "<|begin_of_text|>",
+            "ids": [
+              128000
+            ],
+            "tokens": [
+              "<|begin_of_text|>"
+            ]
+          }
+        }
+      }
+    ]
   },
   "decoder": {
     "type": "ByteLevel",
tokenizer_config.json CHANGED
@@ -2050,14 +2050,18 @@
     }
   },
   "bos_token": "<|begin_of_text|>",
-  "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+  "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|end_of_text|>",
+  "max_length": 4096,
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<|end_of_text|>",
+  "stride": 0,
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first"
 }
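The config-side counterpart: the chat template now gates the trailing assistant header behind `add_generation_prompt`, and the truncation settings (`max_length` 4096, right side, `longest_first`) are mirrored here. A sketch of how the updated template behaves through `transformers`; the repo path is a placeholder, not from this commit:

```python
# Minimal sketch, not part of this commit: the updated chat_template only
# appends the assistant header when add_generation_prompt=True.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this/repo")  # placeholder path

messages = [{"role": "user", "content": "Hi there"}]

with_prompt = tok.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
without_prompt = tok.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=False
)

# Previously the assistant header was unconditional; now only the first
# string ends with it, which matters when tokenizing training examples.
assert with_prompt.endswith("<|start_header_id|>assistant<|end_header_id|>\n\n")
assert without_prompt.endswith("<|eot_id|>")
```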