nisten committed · verified
Commit 32720ee · 1 Parent(s): 565c0e9

Update app.py

Files changed (1):
  1. app.py +21 -14
app.py CHANGED
@@ -10,7 +10,7 @@ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENT
 
 from transformers import OlmoeForCausalLM, AutoTokenizer
 
-model_name = "allenai/OLMoE-1B-7B-0924"
+model_name = "allenai/OLMoE-1B-7B-0924-Instruct"
 
 # Wrap model loading in a try-except block to handle potential errors
 try:
@@ -22,7 +22,7 @@ try:
         low_cpu_mem_usage=True,
         device_map="auto",
         _attn_implementation="flash_attention_2"  # Enable Flash Attention 2
-    )
+    ).to(DEVICE)
     model.gradient_checkpointing_enable()  # Enable gradient checkpointing
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 except Exception as e:
@@ -35,26 +35,33 @@ system_prompt = ("Adopt the persona of hilariously pissed off Andrej Karpathy "
                  "while always answering questions in full first principles analysis type of thinking "
                  "without using any analogies and always showing full working code or output in his answers.")
 
-chat_template = "<|system|>{system_message}<|end|><|user|>{user_message}<|end|><|assistant|>"
-
 @spaces.GPU
 def generate_response(message, history, temperature, max_new_tokens):
     if model is None or tokenizer is None:
         yield "Model or tokenizer not loaded properly. Please check the logs."
         return
 
-    full_prompt = chat_template.format(system_message=system_prompt, user_message=message)
-    inputs = tokenizer(full_prompt, return_tensors="pt").to(DEVICE)
+    messages = [{"role": "system", "content": system_prompt}]
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            messages.append({"role": "assistant", "content": assistant_msg})
+    messages.append({"role": "user", "content": message})
+
+    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
 
     try:
         with torch.no_grad():
-            streamer = tokenizer.stream(inputs.input_ids, model, temperature=temperature, max_new_tokens=max_new_tokens)
-
-            collected_tokens = []
-            for token in streamer:
-                collected_tokens.append(token)
-                partial_text = tokenizer.decode(collected_tokens, skip_special_tokens=True)
-                yield partial_text.strip()
+            generated_ids = model.generate(
+                inputs,
+                max_new_tokens=max_new_tokens,
+                do_sample=True,
+                temperature=temperature,
+                eos_token_id=tokenizer.eos_token_id,
+            )
+
+            generated_text = tokenizer.decode(generated_ids[0, inputs.shape[1]:], skip_special_tokens=True)
+            yield generated_text.strip()
     except RuntimeError as e:
         if "CUDA out of memory" in str(e):
             yield "GPU memory exceeded. Try reducing the max tokens or using a smaller model."
@@ -97,5 +104,5 @@ with gr.Blocks(css=css) as demo:
     clear.click(lambda: None, None, chatbot, queue=False)
 
 if __name__ == "__main__":
-    demo.queue(api_open=False, max_size=10)  # Limiting queue size
+    demo.queue(api_open=True, max_size=10)  # Limiting queue size
     demo.launch(debug=True, show_api=True, share=False)  # Disabled sharing for security
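
Note on the generation change: the removed code called tokenizer.stream(...), which is not a real transformers tokenizer method, and the replacement uses a single blocking model.generate() call, so the response is yielded only once generation finishes. If incremental output is wanted back, the sketch below shows one way to do it with the real TextIteratorStreamer API. It is an assumption layered on top of this commit, not part of it: the function name generate_response_streaming is hypothetical, and model, tokenizer, DEVICE, and system_prompt are reused exactly as defined in app.py.

from threading import Thread
from transformers import TextIteratorStreamer

def generate_response_streaming(message, history, temperature, max_new_tokens):
    # Hypothetical streaming variant of generate_response(); relies on the
    # model, tokenizer, DEVICE, and system_prompt already set up in app.py.
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(DEVICE)

    # skip_prompt=True makes the streamer emit only newly generated text,
    # not the prompt tokens.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs=inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    # model.generate() blocks until it finishes, so run it in a background
    # thread and read decoded text chunks from the streamer in this one.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text.strip()
    thread.join()

The Gradio wiring should not need to change for this variant, since it yields progressively longer strings the same way generate_response does.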