MadsGalsgaard committed on
Commit 9c32687 · verified · 1 Parent(s): a4baa70

new_updated

Files changed (1)
  1. app.py +116 -35
app.py CHANGED
@@ -444,83 +444,164 @@ import os
import spaces
from huggingface_hub import login
login(token=os.getenv("HF_API_TOKEN"))
ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
- model = MllamaForConditionalGeneration.from_pretrained(ckpt,
-     torch_dtype=torch.bfloat16).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)

-
@spaces.GPU
def bot_streaming(message, history, max_new_tokens=250):
-
    txt = message["text"]
    ext_buffer = f"{txt}"

-     messages= []
    images = []

-
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
            images.append(Image.open(msg[0][0]).convert("RGB"))
        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
-             # messages are already handled
-             pass
-         elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

-     # add current message
    if len(message["files"]) == 1:
-
-         if isinstance(message["files"][0], str): # examples
            image = Image.open(message["files"][0]).convert("RGB")
-         else: # regular input
            image = Image.open(message["files"][0]["path"]).convert("RGB")
        images.append(image)
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

-
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)

-     if images == []:
        inputs = processor(text=texts, return_tensors="pt").to("cuda")
    else:
        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)

    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
    generated_text = ""

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""

    for new_text in streamer:
        buffer += new_text
-         generated_text_without_prompt = buffer
-         time.sleep(0.01)
        yield buffer

-
- demo = gr.ChatInterface(fn=bot_streaming, title="Multimodal Llama",
-     textbox=gr.MultimodalTextbox(),
-     additional_inputs = [gr.Slider(
-         minimum=10,
-         maximum=500,
-         value=250,
-         step=10,
-         label="Maximum number of new tokens to generate",
-     )
-     ],
-     cache_examples=False,
-     description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32). ",
-     stop_btn="Stop Generation",
-     fill_height=True,
-     multimodal=True)
-
- demo.launch(debug=True)

import spaces
from huggingface_hub import login
login(token=os.getenv("HF_API_TOKEN"))
+ # ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+ # model = MllamaForConditionalGeneration.from_pretrained(ckpt,
+ #     torch_dtype=torch.bfloat16).to("cuda")
+ # processor = AutoProcessor.from_pretrained(ckpt)
+
+
+ # @spaces.GPU
+ # def bot_streaming(message, history, max_new_tokens=250):
+
+ #     txt = message["text"]
+ #     ext_buffer = f"{txt}"
+
+ #     messages= []
+ #     images = []
+
+
+ #     for i, msg in enumerate(history):
+ #         if isinstance(msg[0], tuple):
+ #             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
+ #             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
+ #             images.append(Image.open(msg[0][0]).convert("RGB"))
+ #         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+ #             # messages are already handled
+ #             pass
+ #         elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
+ #             messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+ #             messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
+
+ #     # add current message
+ #     if len(message["files"]) == 1:
+
+ #         if isinstance(message["files"][0], str): # examples
+ #             image = Image.open(message["files"][0]).convert("RGB")
+ #         else: # regular input
+ #             image = Image.open(message["files"][0]["path"]).convert("RGB")
+ #         images.append(image)
+ #         messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
+ #     else:
+ #         messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
+
+
+ #     texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+ #     if images == []:
+ #         inputs = processor(text=texts, return_tensors="pt").to("cuda")
+ #     else:
+ #         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
+ #     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+
+ #     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+ #     generated_text = ""
+
+ #     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+ #     thread.start()
+ #     buffer = ""
+
+ #     for new_text in streamer:
+ #         buffer += new_text
+ #         generated_text_without_prompt = buffer
+ #         time.sleep(0.01)
+ #         yield buffer
+
+
+ # demo = gr.ChatInterface(fn=bot_streaming, title="Multimodal Llama",
+ #     textbox=gr.MultimodalTextbox(),
+ #     additional_inputs = [gr.Slider(
+ #         minimum=10,
+ #         maximum=500,
+ #         value=250,
+ #         step=10,
+ #         label="Maximum number of new tokens to generate",
+ #     )
+ #     ],
+ #     cache_examples=False,
+ #     description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32). ",
+ #     stop_btn="Stop Generation",
+ #     fill_height=True,
+ #     multimodal=True)
+
+ # demo.launch(debug=True,live=True)
+
ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+ model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)

@spaces.GPU
def bot_streaming(message, history, max_new_tokens=250):
    txt = message["text"]
    ext_buffer = f"{txt}"

+     messages = []
    images = []

+     # Process history messages
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
            images.append(Image.open(msg[0][0]).convert("RGB"))
        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+             pass  # Previous messages already handled
+         elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):  # Text-only turn
            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

+     # Add current message
    if len(message["files"]) == 1:
+         if isinstance(message["files"][0], str):  # Example images
            image = Image.open(message["files"][0]).convert("RGB")
+         else:  # Regular input
            image = Image.open(message["files"][0]["path"]).convert("RGB")
        images.append(image)
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

+     # Prepare input for the model
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)

+     if not images:
        inputs = processor(text=texts, return_tensors="pt").to("cuda")
    else:
        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
+
    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)

    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
    generated_text = ""

+     # Start text generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""

    for new_text in streamer:
        buffer += new_text
+         time.sleep(0.01)  # Small delay to simulate streaming
        yield buffer

+ # Gradio interface setup
+ demo = gr.ChatInterface(
+     fn=bot_streaming,
+     title="Multimodal Llama",
+     textbox=gr.MultimodalTextbox(),
+     additional_inputs=[
+         gr.Slider(
+             minimum=10,
+             maximum=500,
+             value=250,
+             step=10,
+             label="Maximum number of new tokens to generate",
+         )
+     ],
+     cache_examples=False,
+     description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply type your question.",
+     stop_btn="Stop Generation",
+     fill_height=True,
+     multimodal=True
+ )
+
+ demo.launch(debug=True, live=True)
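
For a quick sanity check of the updated bot_streaming generator outside the Gradio UI, a minimal sketch follows; it is not part of this commit. The image path sample.jpg and the 100-token budget are placeholder values, and the sketch assumes the model, processor, and a CUDA GPU are available exactly as set up in app.py, with the @spaces.GPU decorator passing the call through when run outside a Space.

# Hypothetical smoke test for bot_streaming(); not part of app.py.
# Assumes the model/processor above are loaded and "sample.jpg" exists locally.
if __name__ == "__main__":
    sample_message = {
        "text": "Describe this image.",
        "files": ["sample.jpg"],  # a plain string path takes the "Example images" branch
    }
    last = ""
    for partial in bot_streaming(sample_message, history=[], max_new_tokens=100):
        last = partial  # each yield is the cumulative text generated so far
    print(last)

Because each yielded value is the full buffer so far, gr.ChatInterface replaces the displayed assistant message on every yield, which is what produces the streaming effect in the demo.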