EITD committed on
Commit
385e0d5
·
1 Parent(s): 4866329

custom respond

Browse files
Files changed (1) hide show
  1. app.py +29 -13
app.py CHANGED
@@ -1,12 +1,18 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
- import os
 
4
 
5
  """
6
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
7
  """
8
- client = InferenceClient("EITD/lora_model", token=os.getenv("HF_TOKEN"))
9
 
 
 
 
 
 
10
 
11
  def respond(
12
  message,
@@ -26,18 +32,28 @@ def respond(
26
 
27
  messages.append({"role": "user", "content": message})
28
 
29
- response = ""
30
 
31
- for message in client.chat_completion(
32
- messages,
33
- max_tokens=max_tokens,
34
- stream=True,
35
- temperature=temperature,
36
- top_p=top_p,
37
- ):
38
- token = message.choices[0].delta.content
39
 
40
- response += token
 
 
 
 
 
 
 
 
 
 
41
  yield response
42
 
43
 
 
1
  import gradio as gr
2
+ # from huggingface_hub import InferenceClient
3
+ from peft import AutoPeftModelForCausalLM
4
+ from transformers import AutoTokenizer, TextStreamer
5
 
6
  """
7
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
8
  """
9
+ # client = InferenceClient("EITD/lora_model", token=os.getenv("HF_TOKEN"))
10
 
11
+ model_name = "EITD/lora_model"
12
+ model = AutoPeftModelForCausalLM.from_pretrained(
13
+ model_name
14
+ )
15
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
16
 
17
  def respond(
18
  message,
 
32
 
33
  messages.append({"role": "user", "content": message})
34
 
35
+ # response = ""
36
 
37
+ # for message in client.chat_completion(
38
+ # messages,
39
+ # max_tokens=max_tokens,
40
+ # stream=True,
41
+ # temperature=temperature,
42
+ # top_p=top_p,
43
+ # ):
44
+ # token = message.choices[0].delta.content
45
 
46
+ # response += token
47
+ # yield response
48
+ inputs = tokenizer.apply_chat_template(
49
+ messages,
50
+ tokenize = True,
51
+ add_generation_prompt = True, # Must add for generation
52
+ return_tensors = "pt",
53
+ )
54
+ text_streamer = TextStreamer(tokenizer, skip_prompt = True)
55
+ for response in model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = max_tokens, use_cache = True,
56
+ temperature = temperature, min_p = top_p):
57
  yield response
58
 
59