thinh111 committed on
Commit
4929447
·
verified ·
1 Parent(s): 387df4a

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +121 -117
model.py CHANGED
@@ -1,118 +1,122 @@
1
- from unsloth import FastLanguageModel
2
- import torch
3
- max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
4
- dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
5
- load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
6
-
7
- from langchain_community.llms import CTransformers
8
- from langchain.chains import RetrievalQA
9
- from langchain.prompts import PromptTemplate
10
- from langchain_community.embeddings import GPT4AllEmbeddings
11
- from langchain_community.vectorstores import FAISS
12
- from langchain_community.llms import HuggingFacePipeline
13
- from langchain.callbacks.base import BaseCallbackHandler
14
- from transformers import pipeline
15
-
16
- # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
17
- fourbit_models = [
18
- "unsloth/mistral-7b-bnb-4bit",
19
- "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
20
- "unsloth/llama-2-7b-bnb-4bit",
21
- "unsloth/llama-2-13b-bnb-4bit",
22
- "unsloth/codellama-34b-bnb-4bit",
23
- "unsloth/tinyllama-bnb-4bit",
24
- "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
25
- "unsloth/gemma-2b-bnb-4bit",
26
- ] # More models at https://huggingface.co/unsloth
27
-
28
- template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
29
-
30
- ### Instruction:
31
- You are ResVuAssist and You are a helpful bot who reads texts and answers questions about them.
32
-
33
- ### Input:
34
- {context}
35
- QUESTION: {question}
36
-
37
- ### Response:
38
- """
39
-
40
- # Cau hinh
41
- vector_db_path = "vectorstores/db_faiss"
42
-
43
-
44
- def initialModelAndTokenizer():
45
- model, tokenizer = FastLanguageModel.from_pretrained(
46
- model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
47
- max_seq_length = max_seq_length,
48
- dtype = dtype,
49
- load_in_4bit = load_in_4bit,
50
- # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
51
- )
52
- model = FastLanguageModel.get_peft_model(
53
- model,
54
- r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
55
- target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
56
- "gate_proj", "up_proj", "down_proj",],
57
- lora_alpha = 16,
58
- lora_dropout = 0, # Supports any, but = 0 is optimized
59
- bias = "none", # Supports any, but = "none" is optimized
60
- # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
61
- use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
62
- random_state = 3407,
63
- use_rslora = False, # We support rank stabilized LoRA
64
- loftq_config = None, # And LoftQ
65
- )
66
- return model, tokenizer
67
-
68
- def create_pipeline():
69
- model, tokenizer = initialModelAndTokenizer()
70
- pipe = pipeline(
71
- "text-generation",
72
- model=model,
73
- tokenizer=tokenizer,
74
- max_new_tokens=512,
75
- temperature=0.1,
76
- top_p=0.95,
77
- repetition_penalty=1.15
78
- )
79
- return pipe
80
-
81
- # Tao prompt template
82
- def creat_prompt(template):
83
- prompt = PromptTemplate(template = template, input_variables=["context", "question"])
84
- return prompt
85
-
86
- # Tao simple chain
87
- def create_qa_chain(prompt, llm, db):
88
- llm_chain = RetrievalQA.from_chain_type(
89
- llm = llm,
90
- chain_type= "stuff",
91
- # retriever = db.as_retriever(search_kwargs = {"k":8}, max_tokens_limit=1024),
92
- retriever = db.as_retriever(search_kwargs = {"k": 15}, max_tokens_limit=4096),
93
- return_source_documents = False,
94
- chain_type_kwargs= {'prompt': prompt},
95
- )
96
- return llm_chain
97
-
98
- # Read tu VectorDB
99
- def read_vectors_db():
100
- # Embeding
101
- embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
102
- db = FAISS.load_local(vector_db_path, embedding_model, allow_dangerous_deserialization=True)
103
- return db
104
-
105
- def get_response_value(text):
106
- start = text.find('### Response:')
107
- if start != -1:
108
- return text[start + len('### Response:'):].strip()
109
- return None
110
-
111
- def llm_chain_response():
112
- pipe = create_pipeline()
113
- db = read_vectors_db()
114
- prompt = creat_prompt(template)
115
- llm = HuggingFacePipeline(pipeline=pipe)
116
-
117
- llm_chain =create_qa_chain(prompt, llm, db)
 
 
 
 
118
  return llm_chain
 
1
# Environment setup. These are shell commands, not Python: run them in a
# terminal or notebook cell (or move them into requirements.txt) before
# importing this module. Left as bare statements they make the module fail
# with a SyntaxError on import.
#
#   pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#   pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
3
+
4
+
5
+ from unsloth import FastLanguageModel
6
+ import torch
7
# Loader settings consumed by initialModelAndTokenizer().

# Maximum sequence length requested from the loader (any value may be chosen;
# RoPE scaling is handled by Unsloth internally).
max_seq_length = 2048

# None lets the loader auto-detect the dtype. Use float16 on Tesla T4 / V100
# and bfloat16 on Ampere or newer GPUs.
dtype = None

# Load the weights 4-bit quantized to reduce memory usage. May be False.
load_in_4bit = True
10
+
11
+ from langchain_community.llms import CTransformers
12
+ from langchain.chains import RetrievalQA
13
+ from langchain.prompts import PromptTemplate
14
+ from langchain_community.embeddings import GPT4AllEmbeddings
15
+ from langchain_community.vectorstores import FAISS
16
+ from langchain_community.llms import HuggingFacePipeline
17
+ from langchain.callbacks.base import BaseCallbackHandler
18
+ from transformers import pipeline
19
+
20
# Pre-quantized 4-bit checkpoints published by Unsloth (faster downloads, fewer
# out-of-memory errors). More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # Google model trained on 6 trillion tokens
    "unsloth/gemma-2b-bnb-4bit",
]

# Alpaca-style prompt. {context} and {question} are filled in by the prompt
# template built in creat_prompt(); the text after "### Response:" is what
# get_response_value() extracts.
template = (
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "You are ResVuAssist and You are a helpful bot who reads texts and answers questions about them.\n"
    "\n"
    "### Input:\n"
    "{context}\n"
    "QUESTION: {question}\n"
    "\n"
    "### Response:\n"
)

# Configuration: directory holding the persisted FAISS index.
vector_db_path = "vectorstores/db_faiss"
46
+
47
+
48
def initialModelAndTokenizer():
    """Load the 4-bit Mistral instruct checkpoint and attach LoRA adapters.

    Reads the module-level ``max_seq_length``, ``dtype`` and ``load_in_4bit``
    settings.

    Returns:
        A ``(model, tokenizer)`` tuple: the model returned by
        ``FastLanguageModel.get_peft_model`` and the tokenizer returned by
        ``FastLanguageModel.from_pretrained``.
    """
    # Any compatible checkpoint can be used here. Gated models additionally
    # need a ``token="hf_..."`` argument.
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # NOTE(review): no adapter weights are loaded anywhere in this file, so
    # these adapters look freshly initialised. Confirm they are wanted on an
    # inference-only path.
    lora_settings = dict(
        r=16,  # LoRA rank; any value > 0 (suggested 8, 16, 32, 64, 128)
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0,  # any value is supported, 0 is the optimized path
        bias="none",  # any value is supported, "none" is the optimized path
        # True or "unsloth"; "unsloth" is the lower-VRAM option for long context.
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,  # rank-stabilized LoRA is available but off
        loftq_config=None,  # LoftQ is available but off
    )
    peft_model = FastLanguageModel.get_peft_model(base_model, **lora_settings)
    return peft_model, tokenizer
71
+
72
def create_pipeline():
    """Build a ``text-generation`` pipeline around the loaded model.

    Returns:
        The object returned by ``transformers.pipeline`` for the model and
        tokenizer produced by ``initialModelAndTokenizer()``.
    """
    model, tokenizer = initialModelAndTokenizer()

    # NOTE(review): ``do_sample`` is not passed. In transformers,
    # ``temperature`` and ``top_p`` only take effect when sampling is enabled;
    # confirm whether greedy decoding is what is intended here.
    generation_settings = {
        "max_new_tokens": 512,
        "temperature": 0.1,
        "top_p": 0.95,
        "repetition_penalty": 1.15,
    }
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **generation_settings,
    )
84
+
85
# Build the prompt template
def creat_prompt(template):
    """Wrap a template string in a ``PromptTemplate``.

    Args:
        template: Prompt text containing ``{context}`` and ``{question}``
            placeholders.

    Returns:
        A ``PromptTemplate`` whose input variables are ``context`` and
        ``question``.
    """
    return PromptTemplate(
        input_variables=["context", "question"],
        template=template,
    )
89
+
90
# Build the simple retrieval QA chain
def create_qa_chain(prompt, llm, db):
    """Create a "stuff" ``RetrievalQA`` chain over the vector store.

    Args:
        prompt: Prompt passed to the chain through ``chain_type_kwargs``.
        llm: Language model the chain calls.
        db: Vector store; its ``as_retriever`` method supplies the retriever.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type``. Source documents
        are not returned with the answers.
    """
    # An earlier configuration used k=8 with max_tokens_limit=1024.
    # NOTE(review): ``max_tokens_limit`` does not look like a retriever option;
    # confirm it has any effect when passed to ``as_retriever``.
    retriever = db.as_retriever(search_kwargs={"k": 15}, max_tokens_limit=4096)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={'prompt': prompt},
    )
101
+
102
# Read from the vector DB
def read_vectors_db():
    """Load the persisted FAISS index from ``vector_db_path``.

    Returns:
        The vector store returned by ``FAISS.load_local``, using a
        ``GPT4AllEmbeddings`` embedding model.
    """
    embedder = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
    # SECURITY: allow_dangerous_deserialization=True opts in to loading
    # serialized (pickled) index data. Only point vector_db_path at index
    # files this project created itself.
    return FAISS.load_local(
        vector_db_path,
        embedder,
        allow_dangerous_deserialization=True,
    )
108
+
109
def get_response_value(text):
    """Return the text that follows the first ``### Response:`` marker.

    Args:
        text: Model output to search, or ``None``.

    Returns:
        The whitespace-stripped text after the first marker, or ``None`` when
        ``text`` is ``None``, empty, or does not contain the marker.
    """
    marker = '### Response:'
    # Guard against a missing result so the caller gets None, not AttributeError.
    if not text:
        return None
    # partition() splits on the first occurrence, like find() + slicing did.
    _, found, answer = text.partition(marker)
    return answer.strip() if found else None
114
+
115
def llm_chain_response():
    """Assemble the full retrieval QA chain.

    Builds the generation pipeline, loads the vector store, creates the prompt
    from the module-level ``template``, wraps the pipeline in
    ``HuggingFacePipeline`` and hands all three to ``create_qa_chain``.

    Returns:
        The chain returned by ``create_qa_chain``.
    """
    generator = create_pipeline()
    vector_store = read_vectors_db()
    qa_prompt = creat_prompt(template)
    hf_llm = HuggingFacePipeline(pipeline=generator)
    return create_qa_chain(qa_prompt, hf_llm, vector_store)