mannadamay12 committed
Commit 988c5f2 · verified · 1 parent: ca64dfe

Update app.py

Files changed (1): app.py (+49, -39)
app.py CHANGED
@@ -1,44 +1,51 @@
-import os
 import spaces
+import os
+import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-# import torch
-# from transformers import (
-#     AutoTokenizer,
-#     TextStreamer,
-#     pipeline,
-#     BitsAndBytesConfig,
-#     AutoModelForCausalLM
-# )
+
+from transformers import AutoTokenizer, TextStreamer, pipeline, AutoModelForCausalLM
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
 from langchain.llms import HuggingFacePipeline
-import gradio as gr
 
-DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
-model_id = "meta-llama/Llama-3.2-3B-Instruct"
+# System prompts
+DEFAULT_SYSTEM_PROMPT = """
+Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
+respond with "I don't know" or a similar acknowledgment that the answer is not available.
+""".strip()
 
-# Remove the spaces.GPU decorator since we'll handle GPU directly
-# def initialize_model():
-#     bnb_config = BitsAndBytesConfig(
-#         load_in_4bit=True,
-#         bnb_4bit_use_double_quant=True,
-#         bnb_4bit_quant_type="nf4",
-#         bnb_4bit_compute_dtype=torch.bfloat16
-#     )
-
-#     tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
-#     model = AutoModelForCausalLM.from_pretrained(
-#         model_id,
-#         token=os.environ.get("HF_TOKEN"),
-#         quantization_config=bnb_config if torch.cuda.is_available() else None,
-#         device_map="auto" if torch.cuda.is_available() else "cpu",
-#         torch_dtype=torch.float32 if not torch.cuda.is_available() else None
-#     )
-
-#     return model, tokenizer
+SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"
+
+def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
+    return f"""
+[INST] <<SYS>>
+{system_prompt}
+<</SYS>>
+{prompt} [/INST]
+""".strip()
+
+template = generate_prompt(
+    """
+{context}
+Question: {question}
+""",
+    system_prompt=SYSTEM_PROMPT,
+)
+
+prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
+
+# Initialize embeddings and database (CPU only)
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name="hkunlp/instructor-base",
+    model_kwargs={"device": "cpu"}
+)
+
+db = Chroma(
+    persist_directory="db",
+    embedding_function=embeddings
+)
 
 def initialize_model():
     model_id = "meta-llama/Llama-3.2-3B-Instruct"
@@ -48,11 +55,11 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         token=token,
-        device_map="auto"  # This works better with ZeroGPU
+        device_map="auto"
     )
 
     return model, tokenizer
-
+
 @spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
@@ -81,12 +88,12 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         )
 
         response = qa_chain.invoke({"query": message})
-        return response["result"]
+        yield response["result"]
 
     except Exception as e:
-        return f"An error occurred: {str(e)}"
-
+        yield f"An error occurred: {str(e)}"
 
+# Create Gradio interface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -105,7 +112,7 @@ demo = gr.ChatInterface(
         ),
         gr.Slider(
            minimum=0.1,
-            maximum=1.0,
+            maximum=4.0,
            value=0.1,
            step=0.1,
            label="Temperature"
@@ -120,4 +127,7 @@ demo = gr.ChatInterface(
     ],
     title="ROS2 Expert Assistant",
     description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
-)
+)
+
+if __name__ == "__main__":
+    demo.launch()
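
Note on the respond() hunk: switching from return to yield makes the handler a generator, so gr.ChatInterface treats the reply as a (single-chunk) stream instead of a plain return value. The diff does not show how qa_chain is assembled between the model setup and qa_chain.invoke({"query": message}); the sketch below is only a guess at how such a chain is commonly wired from the pieces the new file does define (the transformers text-generation pipeline imports, the module-level db, and prompt_template). The helper name build_qa_chain and the pipeline/retriever settings are illustrative assumptions, not the Space's actual code.

# Hypothetical sketch: the real chain construction is elided from this diff.
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

def build_qa_chain(model, tokenizer, db, prompt_template,
                   max_tokens=256, temperature=0.1, top_p=0.95):
    # Wrap the loaded Llama model in a text-generation pipeline that LangChain can call.
    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        return_full_text=False,
    )
    llm = HuggingFacePipeline(pipeline=text_pipeline)

    # "stuff" chain: retrieved chunks fill the {context} slot of prompt_template.
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 2}),
        chain_type_kwargs={"prompt": prompt_template},
    )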