MadsGalsgaard committed on
Commit 4dd6ef8 · verified · 1 Parent(s): 09c9b33

Update app.py

Files changed (1)
  1. app.py +215 -156
app.py CHANGED
@@ -267,172 +267,231 @@
 
 ####03 3.1 8b
 
-import os
-import time
-import spaces
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
-import gradio as gr
-from threading import Thread
-
-MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct"]
-HF_TOKEN = os.environ.get("HF_API_TOKEN", None)
-print(HF_TOKEN, "######$$$$$$$$$$$$$$$")
-MODEL = os.environ.get("MODEL_ID", "meta-llama/Meta-Llama-3.1-8B-Instruct")
-
-TITLE = "<h1><center>Meta-Llama3.1-8B</center></h1>"
-
-PLACEHOLDER = """
-<center>
-<p>Hi! How can I help you today?</p>
-</center>
-"""
-
-
-CSS = """
-.duplicate-button {
-    margin: auto !important;
-    color: white !important;
-    background: black !important;
-    border-radius: 100vh !important;
-}
-h3 {
-    text-align: center;
-}
-"""
-
-device = "cuda"  # for GPU usage or "cpu" for CPU usage
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4")
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config)
-
-@spaces.GPU()
-def stream_chat(
-    message: str,
-    history: list,
-    system_prompt: str,
-    temperature: float = 0.8,
-    max_new_tokens: int = 1024,
-    top_p: float = 1.0,
-    top_k: int = 20,
-    penalty: float = 1.2,
-):
-    print(f'message: {message}')
-    print(f'history: {history}')
-
-    conversation = [
-        {"role": "system", "content": system_prompt}
-    ]
-    for prompt, answer in history:
-        conversation.extend([
-            {"role": "user", "content": prompt},
-            {"role": "assistant", "content": answer},
-        ])
-
-    conversation.append({"role": "user", "content": message})
-
-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
-
-    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
-
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        max_new_tokens=max_new_tokens,
-        do_sample=False if temperature == 0 else True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        repetition_penalty=penalty,
-        eos_token_id=[128001, 128008, 128009],
-        streamer=streamer,
-    )
-
-    with torch.no_grad():
-        thread = Thread(target=model.generate, kwargs=generate_kwargs)
-        thread.start()
-
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        yield buffer
-
-
-chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
-
-with gr.Blocks(css=CSS, theme="soft") as demo:
-    gr.HTML(TITLE)
-    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
-    gr.ChatInterface(
-        fn=stream_chat,
-        chatbot=chatbot,
-        fill_height=True,
-        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
-        additional_inputs=[
-            gr.Textbox(
-                value="You are a helpful assistant",
-                label="System Prompt",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=0,
-                maximum=1,
-                step=0.1,
-                value=0.8,
-                label="Temperature",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=128,
-                maximum=8192,
-                step=1,
-                value=1024,
-                label="Max new tokens",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=0.0,
-                maximum=1.0,
-                step=0.1,
-                value=1.0,
-                label="top_p",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=1,
-                maximum=20,
-                step=1,
-                value=20,
-                label="top_k",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=0.0,
-                maximum=2.0,
-                step=0.1,
-                value=1.2,
-                label="Repetition penalty",
-                render=False,
-            ),
-        ],
-        examples=[
-            ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
-            ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
-            ["Tell me a random fun fact about the Roman Empire."],
-            ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
-        ],
-        cache_examples=False,
-    )
-
+# import os
+# import time
+# import spaces
+# import torch
+# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+# import gradio as gr
+# from threading import Thread
+
+# MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct"]
+# HF_TOKEN = os.environ.get("HF_API_TOKEN", None)
+# print(HF_TOKEN, "######$$$$$$$$$$$$$$$")
+# MODEL = os.environ.get("MODEL_ID", "meta-llama/Meta-Llama-3.1-8B-Instruct")
+
+# TITLE = "<h1><center>Meta-Llama3.1-8B</center></h1>"
+
+# PLACEHOLDER = """
+# <center>
+# <p>Hi! How can I help you today?</p>
+# </center>
+# """
+
+
+# CSS = """
+# .duplicate-button {
+#     margin: auto !important;
+#     color: white !important;
+#     background: black !important;
+#     border-radius: 100vh !important;
+# }
+# h3 {
+#     text-align: center;
+# }
+# """
+
+# device = "cuda"  # for GPU usage or "cpu" for CPU usage
+
+# quantization_config = BitsAndBytesConfig(
+#     load_in_4bit=True,
+#     bnb_4bit_compute_dtype=torch.bfloat16,
+#     bnb_4bit_use_double_quant=True,
+#     bnb_4bit_quant_type="nf4")
+
+# tokenizer = AutoTokenizer.from_pretrained(MODEL)
+# model = AutoModelForCausalLM.from_pretrained(
+#     MODEL,
+#     torch_dtype=torch.bfloat16,
+#     device_map="auto",
+#     quantization_config=quantization_config)
+
+# @spaces.GPU()
+# def stream_chat(
+#     message: str,
+#     history: list,
+#     system_prompt: str,
+#     temperature: float = 0.8,
+#     max_new_tokens: int = 1024,
+#     top_p: float = 1.0,
+#     top_k: int = 20,
+#     penalty: float = 1.2,
+# ):
+#     print(f'message: {message}')
+#     print(f'history: {history}')
+
+#     conversation = [
+#         {"role": "system", "content": system_prompt}
+#     ]
+#     for prompt, answer in history:
+#         conversation.extend([
+#             {"role": "user", "content": prompt},
+#             {"role": "assistant", "content": answer},
+#         ])
+
+#     conversation.append({"role": "user", "content": message})
+
+#     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
+
+#     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+
+#     generate_kwargs = dict(
+#         input_ids=input_ids,
+#         max_new_tokens=max_new_tokens,
+#         do_sample=False if temperature == 0 else True,
+#         top_p=top_p,
+#         top_k=top_k,
+#         temperature=temperature,
+#         repetition_penalty=penalty,
+#         eos_token_id=[128001, 128008, 128009],
+#         streamer=streamer,
+#     )
+
+#     with torch.no_grad():
+#         thread = Thread(target=model.generate, kwargs=generate_kwargs)
+#         thread.start()
+
+#     buffer = ""
+#     for new_text in streamer:
+#         buffer += new_text
+#         yield buffer
+
+
+# chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
+
+# with gr.Blocks(css=CSS, theme="soft") as demo:
+#     gr.HTML(TITLE)
+#     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
+#     gr.ChatInterface(
+#         fn=stream_chat,
+#         chatbot=chatbot,
+#         fill_height=True,
+#         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+#         additional_inputs=[
+#             gr.Textbox(
+#                 value="You are a helpful assistant",
+#                 label="System Prompt",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=0,
+#                 maximum=1,
+#                 step=0.1,
+#                 value=0.8,
+#                 label="Temperature",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=128,
+#                 maximum=8192,
+#                 step=1,
+#                 value=1024,
+#                 label="Max new tokens",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=0.0,
+#                 maximum=1.0,
+#                 step=0.1,
+#                 value=1.0,
+#                 label="top_p",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=1,
+#                 maximum=20,
+#                 step=1,
+#                 value=20,
+#                 label="top_k",
+#                 render=False,
+#             ),
+#             gr.Slider(
+#                 minimum=0.0,
+#                 maximum=2.0,
+#                 step=0.1,
+#                 value=1.2,
+#                 label="Repetition penalty",
+#                 render=False,
+#             ),
+#         ],
+#         examples=[
+#             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
+#             ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
+#             ["Tell me a random fun fact about the Roman Empire."],
+#             ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
+#         ],
+#         cache_examples=False,
+#     )
+
+
+# if __name__ == "__main__":
+#     demo.launch()
+
+
+########### new client key
+
+import gradio as gr
+from huggingface_hub import InferenceClient
+
+# Hugging Face Inference Client setup
+client = InferenceClient(
+    model="meta-llama/Meta-Llama-3.1-8B-Instruct"  # model id to query via the Inference API (not a token)
+)
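+# NOTE: Llama 3.1 is a gated model, so requests generally need an access token.
+# A sketch, assuming the HF_API_TOKEN env var from the old code is still set
+# (this would also require `import os` above):
+#   client = InferenceClient(model="meta-llama/Meta-Llama-3.1-8B-Instruct", token=os.environ.get("HF_API_TOKEN"))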
+
+# Function to interact with the Hugging Face model
+def chat_with_model(message, history):
+    # Prepare conversation history for the model
+    conversation = [{"role": "system", "content": "You are a helpful assistant."}]
+
+    for past_message, past_response in history:
+        conversation.append({"role": "user", "content": past_message})
+        conversation.append({"role": "assistant", "content": past_response})
+
+    # Add new user message to the conversation
+    conversation.append({"role": "user", "content": message})
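+    # conversation now follows the OpenAI-style message schema that
+    # chat_completion expects: [{"role": "...", "content": "..."}, ...]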
+
+    # Generate response using the Inference API
+    responses = client.chat_completion(
+        messages=conversation,
+        max_tokens=500,
+        stream=True
+    )
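+    # With stream=True, chat_completion returns an iterator of chunks; each
+    # chunk's OpenAI-compatible delta carries the next fragment of the reply.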
+
+    # Capture streamed response
+    response_text = ""
+    for response in responses:
+        delta_content = response.choices[0].delta.content
+        if delta_content:  # guard: the final chunk's delta may carry no content
+            response_text += delta_content
+
+    history.append((message, response_text))
+
+    return history  # the updated history is what the Chatbot displays
+
+# Create Gradio interface
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot(height=600)
+    msg_input = gr.Textbox(show_label=False, placeholder="Type your message...")
+
+    with gr.Row():
+        clear_btn = gr.Button("Clear Chat")
+
+    # Setting up interaction between user input and the chatbot; the chatbot
+    # only needs to be listed once as an output, so the handler returns one value
+    msg_input.submit(chat_with_model, [msg_input, chatbot], [chatbot])
+    clear_btn.click(lambda: None, None, chatbot, queue=False)
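+    # Returning None resets the Chatbot component, clearing the visible chat.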
+
+    gr.Markdown("## Llama 3.1 Chatbot")
+
+# Launch Gradio demo
 if __name__ == "__main__":
     demo.launch()