mannadamay12 committed · verified
Commit cbe2d25 · 1 Parent(s): 5eddda9

Update app.py

Files changed (1): app.py (+57 -53)
app.py CHANGED
@@ -1,41 +1,44 @@
  import os
- import spaces # Move this to the top
+ import spaces # First import
  import gradio as gr
- from huggingface_hub import InferenceClient
  from langchain.embeddings import HuggingFaceInstructEmbeddings
  from langchain.vectorstores import Chroma
  from langchain.prompts import PromptTemplate
  from langchain.chains import RetrievalQA
  from langchain.llms import HuggingFacePipeline
+ from huggingface_hub import InferenceClient

- import torch
- from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM
-
- TORCH_VERSION = torch.__version__
- SUPPORTED_TORCH_VERSIONS = ['2.0.1', '2.1.2', '2.2.2', '2.4.0']
- if TORCH_VERSION.rsplit('+')[0] not in SUPPORTED_TORCH_VERSIONS:
-     print(f"Warning: Current PyTorch version {TORCH_VERSION} may not be compatible with ZeroGPU. "
-           f"Supported versions are: {', '.join(SUPPORTED_TORCH_VERSIONS)}")
-
- # Model initialization
- model_id = "meta-llama/Llama-3.2-3B-Instruct"
- token = os.environ.get("HF_TOKEN")
-
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_compute_dtype=torch.bfloat16
- )
-
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     token=token,
-     quantization_config=bnb_config
- )
+ # GPU initialization moved into a function
+ def initialize_model():
+     import torch
+     from transformers import (
+         AutoTokenizer,
+         TextStreamer,
+         pipeline,
+         BitsAndBytesConfig,
+         AutoModelForCausalLM
+     )
+
+     model_id = "meta-llama/Llama-3.2-3B-Instruct"
+     token = os.environ.get("HF_TOKEN")
+
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_use_double_quant=True,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         token=token,
+         quantization_config=bnb_config
+     )
+
+     return model, tokenizer

- # Initialize InstructEmbeddings
+ # Initialize non-GPU components
  embeddings = HuggingFaceInstructEmbeddings(
      model_name="hkunlp/instructor-base",
      model_kwargs={"device": "cpu"}
@@ -46,35 +49,36 @@ db = Chroma(
      embedding_function=embeddings
  )

- # Setup pipeline
- streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
- text_pipeline = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     max_new_tokens=500,
-     temperature=0.1,
-     top_p=0.95,
-     repetition_penalty=1.15,
-     streamer=streamer,
- )
-
- # Create LLM chain
- llm = HuggingFacePipeline(pipeline=text_pipeline)
- qa_chain = RetrievalQA.from_chain_type(
-     llm=llm,
-     chain_type="stuff",
-     retriever=db.as_retriever(search_kwargs={"k": 2}),
-     return_source_documents=False,
-     chain_type_kwargs={"prompt": prompt_template}
- )
-
  @spaces.GPU(duration=30)
  def respond(message, history, system_message, max_tokens, temperature, top_p):
      try:
-         # Use the QA chain directly
+         # Initialize model components inside the GPU scope
+         model, tokenizer = initialize_model()
+         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+         text_pipeline = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             repetition_penalty=1.15,
+             streamer=streamer,
+         )
+
+         llm = HuggingFacePipeline(pipeline=text_pipeline)
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             chain_type="stuff",
+             retriever=db.as_retriever(search_kwargs={"k": 2}),
+             return_source_documents=False,
+             chain_type_kwargs={"prompt": prompt_template}
+         )
+
          response = qa_chain.invoke({"query": message})
          yield response["result"]
+
      except Exception as e:
          yield f"An error occurred: {str(e)}"
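
One follow-up worth noting: in the hunks shown here, TextStreamer and pipeline are imported only inside initialize_model(), while respond() also references both names at call time. A minimal sketch of one way to make those names resolve, assuming the rest of the module is as shown above and that the elided lines do not already import them (this sketch is not part of the commit), repeats the lazy import inside respond():

@spaces.GPU(duration=30)
def respond(message, history, system_message, max_tokens, temperature, top_p):
    try:
        # Lazy imports, mirroring initialize_model(), so these names resolve here too.
        from transformers import TextStreamer, pipeline

        model, tokenizer = initialize_model()
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.15,
            streamer=streamer,
        )

        llm = HuggingFacePipeline(pipeline=text_pipeline)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=db.as_retriever(search_kwargs={"k": 2}),
            return_source_documents=False,
            chain_type_kwargs={"prompt": prompt_template},  # prompt_template and db come from the elided part of app.py
        )

        response = qa_chain.invoke({"query": message})
        yield response["result"]

    except Exception as e:
        yield f"An error occurred: {str(e)}"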
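
The diff also ends before the UI wiring, but the respond(message, history, system_message, max_tokens, temperature, top_p) signature matches Gradio's ChatInterface-with-additional-inputs pattern. A hypothetical launch block along those lines (labels and defaults are assumptions, chosen to mirror the old hardcoded pipeline values of 500 / 0.1 / 0.95, not taken from the commit) might look like:

# Hypothetical wiring, not shown in this diff: expose respond() through gr.ChatInterface.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=500, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.1, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()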