mannadamay12 committed
Commit 988c5f2 · verified · 1 parent: ca64dfe

Update app.py

Files changed (1): app.py (+49, -39)
app.py CHANGED
@@ -1,44 +1,51 @@
-import os
 import spaces
+import os
+import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-# import torch
-# from transformers import (
-#     AutoTokenizer,
-#     TextStreamer,
-#     pipeline,
-#     BitsAndBytesConfig,
-#     AutoModelForCausalLM
-# )
+
+from transformers import AutoTokenizer, TextStreamer, pipeline, AutoModelForCausalLM
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
 from langchain.llms import HuggingFacePipeline
-import gradio as gr
 
-DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
-model_id = "meta-llama/Llama-3.2-3B-Instruct"
+# System prompts
+DEFAULT_SYSTEM_PROMPT = """
+Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
+respond with "I don't know" or a similar acknowledgment that the answer is not available.
+""".strip()
 
-# Remove the spaces.GPU decorator since we'll handle GPU directly
-# def initialize_model():
-#     bnb_config = BitsAndBytesConfig(
-#         load_in_4bit=True,
-#         bnb_4bit_use_double_quant=True,
-#         bnb_4bit_quant_type="nf4",
-#         bnb_4bit_compute_dtype=torch.bfloat16
-#     )
-
-#     tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
-#     model = AutoModelForCausalLM.from_pretrained(
-#         model_id,
-#         token=os.environ.get("HF_TOKEN"),
-#         quantization_config=bnb_config if torch.cuda.is_available() else None,
-#         device_map="auto" if torch.cuda.is_available() else "cpu",
-#         torch_dtype=torch.float32 if not torch.cuda.is_available() else None
-#     )
-
-#     return model, tokenizer
+SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"
+
+def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
+    return f"""
+[INST] <<SYS>>
+{system_prompt}
+<</SYS>>
+{prompt} [/INST]
+""".strip()
+
+template = generate_prompt(
+    """
+{context}
+Question: {question}
+""",
+    system_prompt=SYSTEM_PROMPT,
+)
+
+prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
+
+# Initialize embeddings and database (CPU only)
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name="hkunlp/instructor-base",
+    model_kwargs={"device": "cpu"}
+)
+
+db = Chroma(
+    persist_directory="db",
+    embedding_function=embeddings
+)
 
 def initialize_model():
     model_id = "meta-llama/Llama-3.2-3B-Instruct"
@@ -48,11 +55,11 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         token=token,
-        device_map="auto"  # This works better with ZeroGPU
+        device_map="auto"
     )
 
     return model, tokenizer
-
+
 @spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
@@ -81,12 +88,12 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         )
 
         response = qa_chain.invoke({"query": message})
-        return response["result"]
+        yield response["result"]
 
     except Exception as e:
-        return f"An error occurred: {str(e)}"
-
+        yield f"An error occurred: {str(e)}"
 
+# Create Gradio interface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -105,7 +112,7 @@ demo = gr.ChatInterface(
         ),
         gr.Slider(
            minimum=0.1,
-            maximum=1.0,
+            maximum=4.0,
            value=0.1,
            step=0.1,
            label="Temperature"
@@ -120,4 +127,7 @@ demo = gr.ChatInterface(
     ],
     title="ROS2 Expert Assistant",
     description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
-)
+)
+
+if __name__ == "__main__":
+    demo.launch()
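
Note on the respond() hunk: switching from return to yield makes the handler a generator, so gr.ChatInterface treats the reply as a (single-chunk) stream instead of a plain return value. The diff does not show how qa_chain is assembled between the model setup and qa_chain.invoke({"query": message}); the sketch below is only a guess at how such a chain is commonly wired from the pieces the new file does define (the transformers text-generation pipeline imports, the module-level db, and prompt_template). The helper name build_qa_chain and the pipeline/retriever settings are illustrative assumptions, not the Space's actual code.

# Hypothetical sketch: the real chain construction is elided from this diff.
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

def build_qa_chain(model, tokenizer, db, prompt_template,
                   max_tokens=256, temperature=0.1, top_p=0.95):
    # Wrap the loaded Llama model in a text-generation pipeline that LangChain can call.
    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        return_full_text=False,
    )
    llm = HuggingFacePipeline(pipeline=text_pipeline)

    # "stuff" chain: retrieved chunks fill the {context} slot of prompt_template.
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 2}),
        chain_type_kwargs={"prompt": prompt_template},
    )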