mannadamay12 committed
Commit 26b862a · verified · 1 Parent(s): ec21171

Update app.py

Files changed (1):
  1. app.py +50 -70
app.py CHANGED
@@ -3,91 +3,71 @@ import torch
  import gradio as gr
  import spaces
  from huggingface_hub import InferenceClient
- from langchain.embeddings import HuggingFaceEmbeddings
  from langchain.vectorstores import Chroma
  from langchain.prompts import PromptTemplate

- # Verify PyTorch version compatibility
- TORCH_VERSION = torch.__version__
- SUPPORTED_TORCH_VERSIONS = ['2.0.1', '2.1.2', '2.2.2', '2.4.0']
- if TORCH_VERSION.rsplit('+')[0] not in SUPPORTED_TORCH_VERSIONS:
-     print(f"Warning: Current PyTorch version {TORCH_VERSION} may not be compatible with ZeroGPU. "
-           f"Supported versions are: {', '.join(SUPPORTED_TORCH_VERSIONS)}")

- # Initialize components outside of GPU scope
- client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct")
- embeddings = HuggingFaceEmbeddings(
-     model_name="sentence-transformers/all-MiniLM-L6-v2",
-     model_kwargs={"device": "cpu"}  # Keep embeddings on CPU
  )

- # Load database
  db = Chroma(
      persist_directory="db",
      embedding_function=embeddings
  )

- # Prompt templates
- DEFAULT_SYSTEM_PROMPT = """
- Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
- respond with "I don't know" or a similar acknowledgment that the answer is not available.
- """.strip()
-
- def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
-     return f"""
- [INST] <<SYS>>
- {system_prompt}
- <</SYS>>
-
- {prompt} [/INST]
- """.strip()
-
- template = generate_prompt(
-     """
- {context}
-
- Question: {question}
- """,
-     system_prompt="Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"
  )

- prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])

- @spaces.GPU(duration=30)  # Reduced duration for faster queue priority
- def respond(
-     message,
-     history,
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     """GPU-accelerated response generation"""
      try:
-         # Retrieve context (CPU operation)
-         docs = db.similarity_search(message, k=2)
-         context = "\n".join([doc.page_content for doc in docs])
-         print(f"Retrieved context: {context[:200]}...")
-
-         # Format prompt
-         formatted_prompt = prompt_template.format(
-             context=context,
-             question=message
-         )
-         print(f"Full prompt: {formatted_prompt}")
-
-         # Stream response (GPU operation)
-         response = ""
-         for message in client.text_generation(
-             prompt=formatted_prompt,
-             max_new_tokens=max_tokens,
-             stream=True,
-             temperature=temperature,
-             top_p=top_p,
-         ):
-             response += message
-             yield response
-
      except Exception as e:
          yield f"An error occurred: {str(e)}"
 
 
  import gradio as gr
  import spaces
  from huggingface_hub import InferenceClient
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
  from langchain.vectorstores import Chroma
  from langchain.prompts import PromptTemplate
+ from langchain.chains import RetrievalQA
+ from langchain.llms import HuggingFacePipeline
+ from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM

+ # Model initialization
+ model_id = "meta-llama/Llama-3.2-3B-Instruct"
+ token = os.environ.get("HF_TOKEN")

+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     token=token,
+     quantization_config=bnb_config
+ )
+
+ # Initialize InstructEmbeddings
+ embeddings = HuggingFaceInstructEmbeddings(
+     model_name="hkunlp/instructor-base",
+     model_kwargs={"device": "cpu"}
  )

  db = Chroma(
      persist_directory="db",
      embedding_function=embeddings
  )

+ # Setup pipeline
+ streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+ text_pipeline = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+     max_new_tokens=500,
+     temperature=0.1,
+     top_p=0.95,
+     repetition_penalty=1.15,
+     streamer=streamer,
  )

+ # Create LLM chain
+ llm = HuggingFacePipeline(pipeline=text_pipeline)
+ qa_chain = RetrievalQA.from_chain_type(
+     llm=llm,
+     chain_type="stuff",
+     retriever=db.as_retriever(search_kwargs={"k": 2}),
+     return_source_documents=False,
+     chain_type_kwargs={"prompt": prompt_template}
+ )

+ @spaces.GPU(duration=30)
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
      try:
+         # Use the QA chain directly
+         response = qa_chain.invoke({"query": message})
+         yield response["result"]
      except Exception as e:
          yield f"An error occurred: {str(e)}"
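
One gap worth flagging in this revision: the new qa_chain still passes prompt_template through chain_type_kwargs, but the commit deletes the generate_prompt/PromptTemplate block that used to define it, and token = os.environ.get("HF_TOKEN") relies on an os import that is not visible in this hunk. The sketch below shows one way the missing pieces could look; the prompt wording and the extra import are assumptions, not part of the commit.

import os  # assumption: only needed if os is not already imported earlier in app.py
from langchain.prompts import PromptTemplate

# Illustrative prompt for the "stuff" RetrievalQA chain above; the exact wording is a sketch.
template = """Use the following pieces of context to answer the question at the end.
Answer in 1 or 2 lines. If the answer is not in the context, say "I don't know".

{context}

Question: {question}
Answer:"""

prompt_template = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)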
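
The commit also swaps the embedding model from sentence-transformers/all-MiniLM-L6-v2 to hkunlp/instructor-base. A Chroma collection persisted under db/ with the old embeddings is generally not queryable through a different embedding function, because the two models do not produce interchangeable vectors, so the index would likely need to be rebuilt. A rough sketch, assuming the original document chunks are still available as a list named docs:

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

# Assumption: docs is the list of LangChain Document chunks the index was originally built from.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": "cpu"},
)

# Rebuild and persist the vector store so retrieval uses the new embedding space.
db = Chroma.from_documents(docs, embeddings, persist_directory="db")
db.persist()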
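
The hunk does not show how respond is wired into the Gradio UI. Because app.py imports gradio and respond keeps the (message, history, system_message, max_tokens, temperature, top_p) signature, the wiring presumably follows the standard gr.ChatInterface pattern sketched below; the widget labels, ranges, and defaults are assumptions. Note that after this change respond no longer reads system_message, max_tokens, temperature, or top_p, since generation settings are fixed in text_pipeline.

# Hypothetical UI wiring, not part of this commit.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.1, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()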