IlyaGusev committed on
Commit
0665cb9
·
1 Parent(s): b426553

Initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ adapter_model.bin filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.model filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,133 @@
1
  ---
2
- license: cc-by-2.0
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ datasets:
3
+ - IlyaGusev/ru_turbo_alpaca
4
+ - IlyaGusev/ru_sharegpt_cleaned
5
+ - IlyaGusev/oasst1_ru_main_branch
6
+ - lksy/ru_instruct_gpt4
7
+ - IlyaGusev/gpt_roleplay_realm
8
+ language:
9
+ - ru
10
+ pipeline_tag: conversational
11
+ license: cc-by-4.0
12
  ---
13
+
14
+ # Saiga2 70B, Russian LLaMA2-based chatbot
15
+
16
+ Based on [LLaMA-2 70B fp16](https://huggingface.co/TheBloke/Llama-2-70B-fp16)
17
+
18
+ This is an adapter-only version.
19
+
20
+ Training code: [link](https://github.com/IlyaGusev/rulm/tree/master/self_instruct)
21
+
22
+ **WARNING**: Avoid using V100 (in Colab, for example). Outputs are much worse in this case.
23
+
24
+ ```python
25
+ from peft import PeftModel, PeftConfig
26
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
27
+
28
# Hub identifier of the LoRA adapter repository used by this example.
MODEL_NAME = "IlyaGusev/saiga2_70b_lora"
# Template applied to every message; filled with the message's role and content.
DEFAULT_MESSAGE_TEMPLATE = "<s>{role}\n{content}</s>\n"
# System message placed at the start of every new conversation.
DEFAULT_SYSTEM_PROMPT = "Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им."


class Conversation:
    """Accumulate chat messages and render them into a single prompt string.

    A conversation always starts with one ``"system"`` message holding the
    system prompt; user and bot messages are appended in call order.
    """

    def __init__(
        self,
        message_template=DEFAULT_MESSAGE_TEMPLATE,
        system_prompt=DEFAULT_SYSTEM_PROMPT,
        start_token_id=1,
        bot_token_id=9225
    ):
        """Create a conversation that contains only the system message.

        Args:
            message_template: ``str.format`` template with ``{role}`` and
                ``{content}`` placeholders, applied to every message.
            system_prompt: Content of the initial ``"system"`` message.
            start_token_id: Token id decoded first when the prompt is closed.
            bot_token_id: Token id decoded right after ``start_token_id``.
        """
        self.message_template = message_template
        self.start_token_id = start_token_id
        self.bot_token_id = bot_token_id
        self.messages = [{
            "role": "system",
            "content": system_prompt
        }]

    def get_start_token_id(self):
        """Return the start token id given at construction."""
        return self.start_token_id

    def get_bot_token_id(self):
        """Return the bot token id given at construction."""
        return self.bot_token_id

    def add_user_message(self, message):
        """Append ``message`` to the history with role ``"user"``."""
        self.messages.append({
            "role": "user",
            "content": message
        })

    def add_bot_message(self, message):
        """Append ``message`` to the history with role ``"bot"``."""
        self.messages.append({
            "role": "bot",
            "content": message
        })

    def get_prompt(self, tokenizer):
        """Render the whole history into one prompt string.

        Every message is formatted with ``message_template``; the text that
        ``tokenizer.decode`` yields for ``[start_token_id, bot_token_id]`` is
        appended after the last message (presumably to cue the model to
        answer as the bot), and surrounding whitespace is stripped.

        Args:
            tokenizer: Object exposing ``decode(list_of_token_ids) -> str``.

        Returns:
            The prompt text, without leading or trailing whitespace.
        """
        parts = [
            self.message_template.format(**message)
            for message in self.messages
        ]
        parts.append(tokenizer.decode([self.start_token_id, self.bot_token_id]))
        # Join once instead of growing a string with += inside a loop.
        return "".join(parts).strip()
73
+
74
+
75
def generate(model, tokenizer, prompt, generation_config):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        model: Object exposing ``device`` and
            ``generate(**inputs, generation_config=...)``.
        tokenizer: Callable that tokenizes ``prompt`` (called with
            ``return_tensors="pt"``) and exposes ``decode``.
        prompt: Prompt text to complete.
        generation_config: Passed through unchanged to ``model.generate``.

    Returns:
        The decoded continuation with special tokens skipped and
        surrounding whitespace stripped.
    """
    data = tokenizer(prompt, return_tensors="pt")
    # Move every tokenizer output onto the device the model reports.
    data = {k: v.to(model.device) for k, v in data.items()}
    output_ids = model.generate(
        **data,
        generation_config=generation_config
    )[0]
    # The first returned sequence starts with the prompt tokens; drop them so
    # only the generated continuation is decoded.
    output_ids = output_ids[len(data["input_ids"][0]):]
    output = tokenizer.decode(output_ids, skip_special_tokens=True)
    return output.strip()
85
+
86
# torch.float16 is used below, but the snippet never imported torch; without
# this import the script stops with NameError before any model is loaded.
import torch

# NOTE(review): adapter_config.json in this commit records
# base_model_name_or_path as "models/llama2-70b", which looks like a local
# path — confirm it resolves in your environment before running.
config = PeftConfig.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
# Attach the LoRA adapter weights on top of the loaded base model.
model = PeftModel.from_pretrained(
    model,
    MODEL_NAME,
    torch_dtype=torch.float16
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
print(generation_config)

inputs = ["Почему трава зеленая?", "Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч"]
for inp in inputs:
    # Each input gets a fresh conversation, so no history is shared.
    conversation = Conversation()
    conversation.add_user_message(inp)
    prompt = conversation.get_prompt(tokenizer)

    output = generate(model, tokenizer, prompt, generation_config)
    print(inp)
    print(output)
    print()
    print("==============================")
    print()
116
+ ```
117
+
118
+ Examples:
119
+ ```
120
+ User: Почему трава зеленая?
121
+ Saiga:
122
+ ```
123
+
124
+ ```
125
+ User: Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч
126
+ Saiga:
127
+ ```
128
+
129
+ v1:
130
+ - dataset code revision 0dbd022613874fcda915f588f4a3292e137017d2
131
+ - wandb [link](https://wandb.ai/ilyagusev/rulm_self_instruct/runs/4wp1y5jx)
132
+ - 5 datasets: ru_turbo_alpaca, ru_sharegpt_cleaned, oasst1_ru_main_branch, gpt_roleplay_realm, ru_instruct_gpt4
133
+ - Datasets merging script: [create_short_chat_set.py](https://github.com/IlyaGusev/rulm/blob/e4238fd9a196405b566a2d5838ab44b7a0f4dc31/self_instruct/src/data_processing/create_short_chat_set.py)
adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "models/llama2-70b",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 16,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj",
19
+ "k_proj",
20
+ "o_proj"
21
+ ],
22
+ "task_type": "CAUSAL_LM"
23
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dead3782b2210b8e3166e3978a19523476302fa13f937abb4da186aaba018e4
3
+ size 262375757
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pad_token_id": 0,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "temperature": 0.2,
6
+ "top_p": 0.9,
7
+ "top_k": 30,
8
+ "do_sample": true,
9
+ "max_new_tokens": 1536,
10
+ "num_beams": 1,
11
+ "repetition_penalty": 1.15,
12
+ "no_repeat_ngram_size": 15
13
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<unk>",
5
+ "sep_token": "<s>",
6
+ "unk_token": "<unk>"
7
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "legacy": null,
22
+ "model_max_length": 4096,
23
+ "pad_token": null,
24
+ "padding_side": "left",
25
+ "sp_model_kwargs": {},
26
+ "spaces_between_special_tokens": false,
27
+ "tokenizer_class": "LlamaTokenizer",
28
+ "unk_token": {
29
+ "__type": "AddedToken",
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false
35
+ },
36
+ "use_default_system_prompt": true
37
+ }