mannadamay12 committed · verified
Commit cbe2d25 · 1 Parent(s): 5eddda9

Update app.py

Files changed (1): app.py (+57 -53)
app.py CHANGED
@@ -1,41 +1,44 @@
  import os
- import spaces # Move this to the top
+ import spaces # First import
  import gradio as gr
- from huggingface_hub import InferenceClient
  from langchain.embeddings import HuggingFaceInstructEmbeddings
  from langchain.vectorstores import Chroma
  from langchain.prompts import PromptTemplate
  from langchain.chains import RetrievalQA
  from langchain.llms import HuggingFacePipeline
+ from huggingface_hub import InferenceClient

- import torch
- from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM
-
- TORCH_VERSION = torch.__version__
- SUPPORTED_TORCH_VERSIONS = ['2.0.1', '2.1.2', '2.2.2', '2.4.0']
- if TORCH_VERSION.rsplit('+')[0] not in SUPPORTED_TORCH_VERSIONS:
-     print(f"Warning: Current PyTorch version {TORCH_VERSION} may not be compatible with ZeroGPU. "
-           f"Supported versions are: {', '.join(SUPPORTED_TORCH_VERSIONS)}")
-
- # Model initialization
- model_id = "meta-llama/Llama-3.2-3B-Instruct"
- token = os.environ.get("HF_TOKEN")
-
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_compute_dtype=torch.bfloat16
- )
-
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     token=token,
-     quantization_config=bnb_config
- )
+ # GPU initialization moved into a function
+ def initialize_model():
+     import torch
+     from transformers import (
+         AutoTokenizer,
+         TextStreamer,
+         pipeline,
+         BitsAndBytesConfig,
+         AutoModelForCausalLM
+     )
+
+     model_id = "meta-llama/Llama-3.2-3B-Instruct"
+     token = os.environ.get("HF_TOKEN")
+
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_use_double_quant=True,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         token=token,
+         quantization_config=bnb_config
+     )
+
+     return model, tokenizer

- # Initialize InstructEmbeddings
+ # Initialize non-GPU components
  embeddings = HuggingFaceInstructEmbeddings(
      model_name="hkunlp/instructor-base",
      model_kwargs={"device": "cpu"}
@@ -46,35 +49,36 @@ db = Chroma(
      embedding_function=embeddings
  )

- # Setup pipeline
- streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
- text_pipeline = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     max_new_tokens=500,
-     temperature=0.1,
-     top_p=0.95,
-     repetition_penalty=1.15,
-     streamer=streamer,
- )
-
- # Create LLM chain
- llm = HuggingFacePipeline(pipeline=text_pipeline)
- qa_chain = RetrievalQA.from_chain_type(
-     llm=llm,
-     chain_type="stuff",
-     retriever=db.as_retriever(search_kwargs={"k": 2}),
-     return_source_documents=False,
-     chain_type_kwargs={"prompt": prompt_template}
- )
-
  @spaces.GPU(duration=30)
  def respond(message, history, system_message, max_tokens, temperature, top_p):
      try:
-         # Use the QA chain directly
+         # Initialize model components inside the GPU scope
+         model, tokenizer = initialize_model()
+         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+         text_pipeline = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             repetition_penalty=1.15,
+             streamer=streamer,
+         )
+
+         llm = HuggingFacePipeline(pipeline=text_pipeline)
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             chain_type="stuff",
+             retriever=db.as_retriever(search_kwargs={"k": 2}),
+             return_source_documents=False,
+             chain_type_kwargs={"prompt": prompt_template}
+         )
+
          response = qa_chain.invoke({"query": message})
          yield response["result"]
+
      except Exception as e:
          yield f"An error occurred: {str(e)}"
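
One follow-up worth noting: in the hunks shown here, TextStreamer and pipeline are imported only inside initialize_model(), while respond() also references both names at call time. A minimal sketch of one way to make those names resolve, assuming the rest of the module is as shown above and that the elided lines do not already import them (this sketch is not part of the commit), repeats the lazy import inside respond():

@spaces.GPU(duration=30)
def respond(message, history, system_message, max_tokens, temperature, top_p):
    try:
        # Lazy imports, mirroring initialize_model(), so these names resolve here too.
        from transformers import TextStreamer, pipeline

        model, tokenizer = initialize_model()
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.15,
            streamer=streamer,
        )

        llm = HuggingFacePipeline(pipeline=text_pipeline)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=db.as_retriever(search_kwargs={"k": 2}),
            return_source_documents=False,
            chain_type_kwargs={"prompt": prompt_template},  # prompt_template and db come from the elided part of app.py
        )

        response = qa_chain.invoke({"query": message})
        yield response["result"]

    except Exception as e:
        yield f"An error occurred: {str(e)}"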
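
The diff also ends before the UI wiring, but the respond(message, history, system_message, max_tokens, temperature, top_p) signature matches Gradio's ChatInterface-with-additional-inputs pattern. A hypothetical launch block along those lines (labels and defaults are assumptions, chosen to mirror the old hardcoded pipeline values of 500 / 0.1 / 0.95, not taken from the commit) might look like:

# Hypothetical wiring, not shown in this diff: expose respond() through gr.ChatInterface.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=500, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.1, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()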