khulnasoft committed
Commit 5a8bfb5 · verified · 1 Parent(s): d204a44

Update app.py

Files changed (1)
  1. app.py +62 -43
app.py CHANGED
@@ -1,78 +1,105 @@
 import os
 from threading import Thread
-from typing import Iterator
+from typing import Iterator, List, Tuple
 
 import gradio as gr
-import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
+# Constants
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 DESCRIPTION = """\
 # DeepCode-6.7B-Chat
-
-This Space demonstrates model [DeepCode-AI](https://huggingface.co/deepcode-ai/deepcode-ai-6.7b-instruct) by DeepCode, a code model with 6.7B parameters fine-tuned for chat instructions.
+This Space demonstrates model [DeepCode-AI](https://huggingface.co/deepcode-ai/deepcode-ai-6.7b-instruct)
+by DeepCode, a code model with 6.7B parameters fine-tuned for chat instructions.
 """
 
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-
-
-if torch.cuda.is_available():
+    model = None
+else:
     model_id = "deepcode-ai/deepcode-ai-6.7b-instruct"
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id, torch_dtype=torch.bfloat16, device_map="auto"
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokenizer.use_default_system_prompt = False
-
 
 
-@spaces.GPU
+def trim_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
+    """
+    Trim input_ids to fit within the MAX_INPUT_TOKEN_LENGTH.
+    """
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input as it exceeded {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    return input_ids
+
+
+def build_conversation(message: str, chat_history: List[Tuple[str, str]], system_prompt: str) -> List[dict]:
+    """
+    Build the conversation structure for the chat model.
+    """
+    conversation = []
+    if system_prompt:
+        conversation.append({"role": "system", "content": system_prompt})
+    for user, assistant in chat_history:
+        conversation.extend([
+            {"role": "user", "content": user},
+            {"role": "assistant", "content": assistant}
+        ])
+    conversation.append({"role": "user", "content": message})
+    return conversation
+
+
 def generate(
     message: str,
-    chat_history: list,
+    chat_history: List[Tuple[str, str]],
     system_prompt: str,
-    max_new_tokens: int = 1024,
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
     temperature: float = 0.6,
     top_p: float = 0.9,
     top_k: int = 50,
-    repetition_penalty: float = 1,
+    repetition_penalty: float = 1.0,
 ) -> Iterator[str]:
-    conversation = []
-    if system_prompt:
-        conversation.append({"role": "system", "content": system_prompt})
-    for user, assistant in chat_history:
-        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-    conversation.append({"role": "user", "content": message})
+    if model is None:
+        yield "GPU is unavailable. This demo does not run on CPU."
+        return
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt", add_generation_prompt=True)
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
+    conversation = build_conversation(message, chat_history, system_prompt)
+    input_ids = tokenizer.apply_chat_template(
+        conversation, return_tensors="pt", add_generation_prompt=True
+    )
+    input_ids = trim_input_ids(input_ids.to(model.device))
 
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    streamer = TextIteratorStreamer(
+        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+    )
     generate_kwargs = dict(
-        {"input_ids": input_ids},
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=False,
         num_beams=1,
         repetition_penalty=repetition_penalty,
-        eos_token_id=tokenizer.eos_token_id
+        eos_token_id=tokenizer.eos_token_id,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
     outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs).replace("<|EOT|>","")
+    try:
+        for text in streamer:
+            outputs.append(text)
+            yield "".join(outputs).replace("<|EOT|>", "")
+    except Exception as e:
+        yield f"Error during generation: {e}"
 
 
+# Gradio Interface
 chat_interface = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -84,13 +111,6 @@ chat_interface = gr.ChatInterface(
             step=1,
             value=DEFAULT_MAX_NEW_TOKENS,
         ),
-        # gr.Slider(
-        #     label="Temperature",
-        #     minimum=0,
-        #     maximum=4.0,
-        #     step=0.1,
-        #     value=0,
-        # ),
         gr.Slider(
             label="Top-p (nucleus sampling)",
             minimum=0.05,
@@ -110,14 +130,13 @@ chat_interface = gr.ChatInterface(
            minimum=1.0,
            maximum=2.0,
            step=0.05,
-           value=1,
+           value=1.0,
        ),
    ],
-   stop_btn=None,
    examples=[
-       ["implement snake game using pygame"],
-       ["Can you explain briefly to me what is the Python programming language?"],
-       ["write a program to find the factorial of a number"],
+       ["Implement snake game using pygame"],
+       ["Can you explain what the Python programming language is?"],
+       ["Write a program to find the factorial of a number"],
    ],
 )
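Review note: the new generate_kwargs still set do_sample=False with num_beams=1 (greedy decoding), so the temperature, top_p, and top_k parameters that generate() accepts, and the matching UI sliders, have no effect on the output. A minimal sketch of how they could be wired up if sampling were actually wanted; the helper name is hypothetical, but the keyword arguments are the standard transformers generation arguments:

# Hypothetical helper, not part of this commit: builds sampling-enabled kwargs
# for model.generate so the UI's temperature/top-p/top-k values take effect.
def sampling_kwargs(input_ids, streamer, max_new_tokens, temperature, top_p,
                    top_k, repetition_penalty, eos_token_id):
    return dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,                # sample instead of decoding greedily
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        eos_token_id=eos_token_id,
    )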
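Review note: a side benefit of factoring out build_conversation and trim_input_ids is that both can be smoke-tested without a GPU. A minimal sketch, assuming the file is saved as app.py on a CPU-only machine (the CPU branch sets model = None and downloads no weights) and that any launch call below the shown hunks is guarded by if __name__ == "__main__"; the test input stays under the token cap so no gr.Warning is triggered:

import torch
from app import build_conversation, trim_input_ids

# Roles should come out in order: system prompt, then history pairs, then the new message.
conv = build_conversation(
    message="Write a hello world program in Python",
    chat_history=[("hi", "Hello! How can I help?")],
    system_prompt="You are a helpful coding assistant.",
)
assert [turn["role"] for turn in conv] == ["system", "user", "assistant", "user"]

# Inputs under MAX_INPUT_TOKEN_LENGTH pass through unchanged; longer ones
# would be cut down to the most recent tokens and trigger a gr.Warning.
ids = torch.zeros((1, 100), dtype=torch.long)
assert trim_input_ids(ids).shape == (1, 100)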