---
library_name: transformers
license: apache-2.0
base_model: answerdotai/ModernBERT-base
model-index:
- name: KoModernBERT-base-mlm-v02-ckp02
  results: []
language:
- ko
---

# KoModernBERT-base-mlm-v02

This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base), trained with:

* Flash-Attention 2
* StableAdamW
* Unpadding & Sequence Packing

It achieves the following results on the evaluation set:
- Loss: 1.6437
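
Since training used Flash-Attention 2, the checkpoint can also be loaded with those kernels at inference time via the standard `attn_implementation` argument of `from_pretrained`. This is a minimal sketch, assuming the `flash-attn` package and a compatible GPU are available:

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

model_id = "x2bee/KoModernBERT-base-mlm-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Request Flash-Attention 2 kernels; use attn_implementation="sdpa" if flash-attn is not installed.
model = AutoModelForMaskedLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).to("cuda")
```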

## Example Use
```python
import random

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from huggingface_hub import HfApi, login

# Authenticate with the Hugging Face Hub using a locally stored token.
with open('./api_key/HGF_TOKEN.txt', 'r') as hgf:
    login(token=hgf.read())
api = HfApi()

model_id = "x2bee/KoModernBERT-base-mlm-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id).to("cuda")

def modern_bert_convert_with_multiple_masks(text: str, top_k: int = 1, select_method:str = "Logit") -> str:
    if "[MASK]" not in text:
        raise ValueError("MLM Model should include '[MASK]' in the sentence")

    while "[MASK]" in text:
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        outputs = model(**inputs)

        input_ids = inputs["input_ids"][0].tolist()
        mask_indices = [i for i, token_id in enumerate(input_ids) if token_id == tokenizer.mask_token_id]

        current_mask_index = mask_indices[0]

        logits = outputs.logits[0, current_mask_index]

        top_k_logits, top_k_indices = logits.topk(top_k)
        top_k_tokens = top_k_indices.tolist()
        
        # Choose the replacement token: "Logit" samples from the top-k weighted by softmax,
        # "Random" picks uniformly from the top-k, "Best" always takes the top prediction.
        if select_method == "Logit":
            probabilities = torch.softmax(top_k_logits, dim=0).tolist()
            predicted_token_id = random.choices(top_k_indices.tolist(), weights=probabilities, k=1)[0]
            predicted_token = tokenizer.decode([predicted_token_id]).strip()
            
        elif select_method == "Random":
            predicted_token_id = random.choice(top_k_tokens)
            predicted_token = tokenizer.decode([predicted_token_id]).strip()
            
        elif select_method == "Best":
            predicted_token_id = top_k_tokens[0]
            predicted_token = tokenizer.decode([predicted_token_id]).strip()
            
        else:
            raise ValueError("select_method should be one of ['Logit', 'Random', 'Best']")

        text = text.replace("[MASK]", predicted_token, 1)

        print(f"Predicted: {predicted_token} | Current text: {text}")

    return text
```

```
text = "30์ผ ์ „๋‚จ ๋ฌด์•ˆ๊ตญ์ œ[MASK] ํ™œ์ฃผ๋กœ์— ์ „๋‚  ๋ฐœ์ƒํ•œ ์ œ์ฃผํ•ญ๊ณต [MASK] ๋‹น์‹œ ๊ธฐ์ฒด๊ฐ€ [MASK]์ฐฉ๋ฅ™ํ•˜๋ฉด์„œ ๊ฐ•ํ•œ ๋งˆ์ฐฐ๋กœ ์ƒ๊ธด ํ”์ ์ด ๋‚จ์•„ ์žˆ๋‹ค. ์ด ์ฐธ์‚ฌ๋กœ [MASK]๊ณผ ์Šน๋ฌด์› 181๋ช… ์ค‘ 179๋ช…์ด ์ˆจ์ง€๊ณ  [MASK]๋Š” ํ˜•์ฒด๋ฅผ ์•Œ์•„๋ณผ ์ˆ˜ ์—†์ด [MASK]๋๋‹ค. [MASK] ๊ทœ๋ชจ์™€ [MASK] ์›์ธ ๋“ฑ์— ๋Œ€ํ•ด ๋‹ค์–‘ํ•œ [MASK]์ด ์ œ๊ธฐ๋˜๊ณ  ์žˆ๋Š” ๊ฐ€์šด๋ฐ [MASK]์— ์„ค์น˜๋œ [MASK](์ฐฉ๋ฅ™ ์œ ๋„ ์•ˆ์ „์‹œ์„ค)๊ฐ€ [MASK]๋ฅผ ํ‚ค์› ๋‹ค๋Š” [MASK]์ด ๋‚˜์˜ค๊ณ  ์žˆ๋‹ค."
result = modern_bert_convert_with_multiple_masks(text, top_k=1)

'30์ผ ์ „๋‚จ ๋ฌด์•ˆ๊ตญ์ œํ„ฐ๋ฏธ๋„ ํ™œ์ฃผ๋กœ์— ์ „๋‚  ๋ฐœ์ƒํ•œ ์ œ์ฃผํ•ญ๊ณต ์‚ฌ๊ณ  ๋‹น์‹œ ๊ธฐ์ฒด๊ฐ€ ๋ฌด๋‹จ์ฐฉ๋ฅ™ํ•˜๋ฉด์„œ ๊ฐ•ํ•œ ๋งˆ์ฐฐ๋กœ ์ƒ๊ธด ํ”์ ์ด ๋‚จ์•„ ์žˆ๋‹ค. ์ด ์ฐธ์‚ฌ๋กœ ์Šน๊ฐ๊ณผ ์Šน๋ฌด์› 181๋ช… ์ค‘ 179๋ช…์ด ์ˆจ์ง€๊ณ  ์ผ๋ถ€๋Š” ํ˜•์ฒด๋ฅผ ์•Œ์•„๋ณผ ์ˆ˜ ์—†์ด ์‹ค์ข…๋๋‹ค. ์‚ฌ๊ณ  ๊ทœ๋ชจ์™€ ์‚ฌ๊ณ  ์›์ธ ๋“ฑ์— ๋Œ€ํ•ด ๋‹ค์–‘ํ•œ ์˜ํ˜น์ด ์ œ๊ธฐ๋˜๊ณ  ์žˆ๋Š” ๊ฐ€์šด๋ฐ ๊ธฐ๋‚ด์— ์„ค์น˜๋œ ESC(์ฐฉ๋ฅ™ ์œ ๋„ ์•ˆ์ „์‹œ์„ค)๊ฐ€ ์‚ฌ๊ณ ๋ฅผ ํ‚ค์› ๋‹ค๋Š” ์ฃผ์žฅ์ด ๋‚˜์˜ค๊ณ  ์žˆ๋‹ค.'
```

```
text = "์ค‘๊ตญ์˜ ์ˆ˜๋„๋Š” [MASK]์ด๋‹ค"
result = modern_bert_convert_with_multiple_masks(text, top_k=1)
'์ค‘๊ตญ์˜ ์ˆ˜๋„๋Š” ๋ฒ ์ด์ง•์ด๋‹ค'

text = "์ผ๋ณธ์˜ ์ˆ˜๋„๋Š” [MASK]์ด๋‹ค"
result = modern_bert_convert_with_multiple_masks(text, top_k=1)
'์ผ๋ณธ์˜ ์ˆ˜๋„๋Š” ๋„์ฟ„์ด๋‹ค'

text = "๋Œ€ํ•œ๋ฏผ๊ตญ์˜ ๊ฐ€์žฅ ํฐ ๋„์‹œ๋Š” [MASK]์ด๋‹ค"
result = modern_bert_convert_with_multiple_masks(text, top_k=1)
'๋Œ€ํ•œ๋ฏผ๊ตญ์˜ ๊ฐ€์žฅ ํฐ ๋„์‹œ๋Š” ์ธ์ฒœ์ด๋‹ค'
```
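
For single-mask prediction, the standard `fill-mask` pipeline from `transformers` is a lighter-weight alternative to the helper above. A minimal sketch, assuming the same checkpoint:

```python
from transformers import pipeline

# fill-mask returns the top-k candidate tokens for a single [MASK].
fill_mask = pipeline("fill-mask", model="x2bee/KoModernBERT-base-mlm-v01", device=0)
print(fill_mask("์ค‘๊ตญ์˜ ์ˆ˜๋„๋Š” [MASK]์ด๋‹ค", top_k=3))
```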

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 1e-06
- train_batch_size: 8
- eval_batch_size: 8
- seed: 42
- distributed_type: multi-GPU
- num_devices: 8
- gradient_accumulation_steps: 8
- total_train_batch_size: 512
- total_eval_batch_size: 64
- optimizer: ADAMW_TORCH with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
- lr_scheduler_type: linear
- lr_scheduler_warmup_ratio: 0.1
- num_epochs: 1
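
For reference, these settings roughly correspond to a `transformers.TrainingArguments` configuration along the following lines. This is a sketch only: `output_dir` is a placeholder, dataset and collator setup are omitted, and the per-device batch sizes reflect 8 GPUs with gradient accumulation of 8.

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="komodernbert-base-mlm",  # placeholder path
    learning_rate=1e-6,
    per_device_train_batch_size=8,       # 8 devices x 8 accumulation steps -> effective train batch size 512
    per_device_eval_batch_size=8,        # 8 devices -> effective eval batch size 64
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    seed=42,
)
```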

### Training results

| Training Loss | Epoch  | Step  | Validation Loss |
|:-------------:|:------:|:-----:|:---------------:|
| 14.3633       | 0.0986 | 3000  | 1.7944          |
| 14.0205       | 0.1973 | 6000  | 1.7638          |
| 14.0391       | 0.2959 | 9000  | 1.7430          |
| 13.8014       | 0.3946 | 12000 | 1.7255          |
| 13.6803       | 0.4932 | 15000 | 1.7118          |
| 13.5763       | 0.5919 | 18000 | 1.6961          |
| 13.4827       | 0.6905 | 21000 | 1.6824          |
| 13.3855       | 0.7892 | 24000 | 1.6700          |
| 13.2238       | 0.8878 | 27000 | 1.6558          |
| 13.0954       | 0.9865 | 30000 | 1.6437          |


### Framework versions

- Transformers 4.48.0
- Pytorch 2.5.1+cu124
- Datasets 3.2.0
- Tokenizers 0.21.0