SG1.0 / tokenizer_config.json
{
  "tokenizer_name": "AutoTokenizer",
  "pretrained_model_name": "AutoModel",
  "vocab": {
    "vocab_size": 30522,
    "model_max_length": 512,
    "padding_side": "right",
    "truncation_side": "right",
    "special_tokens": {
      "pad_token": "[PAD]",
      "unk_token": "[UNK]",
      "cls_token": "[CLS]",
      "sep_token": "[SEP]",
      "mask_token": "[MASK]"
    },
    "tokenizer_type": "WordPiece",
    "lowercase": true,
    "pad_token_id": 0,
    "unk_token_id": 100,
    "cls_token_id": 101,
    "sep_token_id": 102,
    "mask_token_id": 103
  },
  "normalization": {
    "lowercase": true,
    "strip_accents": true
  },
  "preprocessing": {
    "do_lower_case": true,
    "handle_chinese_chars": true
  }
}
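
Note that this file nests its fields under "vocab", "normalization", and "preprocessing" rather than using the flat schema that transformers writes, so AutoTokenizer.from_pretrained may not pick these values up on its own. The following is a minimal sketch of wiring the same values into a BERT-style WordPiece tokenizer with the tokenizers library; the vocab.txt filename is an assumption, not part of this config.

# Minimal sketch: build a WordPiece tokenizer from the values in this config.
# Assumption: a WordPiece vocabulary file "vocab.txt" sits next to this config
# (it is not named anywhere in the file itself).
import json

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import TemplateProcessing

with open("tokenizer_config.json") as f:
    cfg = json.load(f)

vocab_cfg = cfg["vocab"]
special = vocab_cfg["special_tokens"]

# WordPiece model with the unknown token declared in the config.
tokenizer = Tokenizer(WordPiece.from_file(
    "vocab.txt",
    unk_token=special["unk_token"],
))

# BERT-style normalization mirroring the "normalization" and "preprocessing"
# blocks: lowercasing, accent stripping, and Chinese-character handling.
tokenizer.normalizer = BertNormalizer(
    lowercase=cfg["normalization"]["lowercase"],
    strip_accents=cfg["normalization"]["strip_accents"],
    handle_chinese_chars=cfg["preprocessing"]["handle_chinese_chars"],
)
tokenizer.pre_tokenizer = BertPreTokenizer()

# Wrap sequences in [CLS] ... [SEP], using the token ids from the config.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B [SEP]",
    special_tokens=[
        (special["cls_token"], vocab_cfg["cls_token_id"]),
        (special["sep_token"], vocab_cfg["sep_token_id"]),
    ],
)

# truncation_side is "right", which is the library default, so only the
# maximum length needs to be set explicitly.
tokenizer.enable_truncation(max_length=vocab_cfg["model_max_length"])
tokenizer.enable_padding(
    direction=vocab_cfg["padding_side"],
    pad_id=vocab_cfg["pad_token_id"],
    pad_token=special["pad_token"],
)

The special token ids here (0, 100, 101, 102, 103) and the vocabulary size of 30522 match the bert-base-uncased layout, which is consistent with the lowercase WordPiece settings in the config.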