Spaces: Runtime error
VictorSanh committed · Commit 0e509eb · 1 Parent(s): e26a679
Update visualization
Browse files · app_dialogue.py +63 -17
app_dialogue.py
CHANGED
@@ -15,9 +15,9 @@ from text_generation import Client
 from transformers import AutoProcessor


-MODELS = [
+MODELS = [  # TODO uncomment
     "HuggingFaceM4/idefics-9b-instruct",
-    "HuggingFaceM4/idefics-80b-instruct",
+    # "HuggingFaceM4/idefics-80b-instruct",
 ]

 API_PATHS = {
@@ -66,7 +66,7 @@ API_TOKEN = os.getenv("HF_AUTH_TOKEN")
 IDEFICS_LOGO = "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/IDEFICS_logo.png"

 PROCESSOR = AutoProcessor.from_pretrained(
-    "HuggingFaceM4/idefics-
+    "HuggingFaceM4/idefics-9b-instruct",
     token=API_TOKEN,
 )

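Taken together with the `from transformers import AutoProcessor` import and the `API_TOKEN = os.getenv("HF_AUTH_TOKEN")` line visible in the hunk headers above, the new processor setup amounts to the standalone snippet below. The final call is only an illustration of what an IDEFICS processor accepts (batches of interleaved text and image URLs); it is not part of the Space's code, and the prompt content is made up.

# Standalone version of the processor setup in the hunk above.
# Assumes HF_AUTH_TOKEN is set and grants access to the checkpoint.
import os

from transformers import AutoProcessor

API_TOKEN = os.getenv("HF_AUTH_TOKEN")
IDEFICS_LOGO = "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/IDEFICS_logo.png"

PROCESSOR = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics-9b-instruct",
    token=API_TOKEN,
)

# Illustration only: the IDEFICS processor takes batches of interleaved text and image URLs.
inputs = PROCESSOR(
    [["User: What is shown in this image?", IDEFICS_LOGO, "<end_of_utterance>", "\nAssistant:"]],
    return_tensors="pt",
)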
@@ -314,7 +314,6 @@ textbox = gr.Textbox(
     visible=True,
     container=False,
     label="Text input",
-    scale = 6
 )
 with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
     gr.HTML("""<h1 align="center">🐶 IDEFICS Playground - EMBARGO UNTIL AUGUST 22ND</h1>""")  # TODO remove embargo
@@ -326,7 +325,7 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
     **EMBARGO UNTIL AUGUST 22ND** This demo showcases **IDEFICS**, an open-access large visual language model. Like GPT-4, the multimodal model accepts arbitrary sequences of image and text inputs and produces text outputs. IDEFICS can answer questions about images, describe visual content, create stories grounded in multiple images, etc.
     <br>IDEFICS (which stands for **I**mage-aware **D**ecoder **E**nhanced à la **F**lamingo with **I**nterleaved **C**ross-attention**S**) is an open-access reproduction of [Flamingo](https://huggingface.co/papers/2204.14198), a closed-source visual language model developed by DeepMind. IDEFICS was built solely on publicly available data and models. It is currently the only visual language model of this scale available in open access.

-    The variants available in this demo were fine-tuned on a mixture of supervised and instruction fine-tuning datasets to make the models more suitable for conversational settings. For more details, we refer to our [blog post](
+    The variants available in this demo were fine-tuned on a mixture of supervised and instruction fine-tuning datasets to make the models more suitable for conversational settings. For more details, we refer to our [blog post](TODO).

     **Intended uses:** This demo and the [supporting models](https://huggingface.co/models?sort=trending&search=HuggingFaceM4%2Fidefics) are provided as research artefacts to the community. We detail misuses and out-of-scope uses [here](https://huggingface.co/HuggingFaceM4/idefics-80b#misuse-and-out-of-scope-use).

@@ -384,10 +383,15 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:

     with gr.Group():
         with gr.Row():
+            with gr.Column(scale=0.6):
                 textbox.render()
+            with gr.Column(scale=0.1, min_width=80):
                 submit_btn = gr.Button(value="▶️ Submit", visible=True)
+            with gr.Column(scale=0.1, min_width=0):
                 clear_btn = gr.ClearButton([textbox, imagebox, chatbot], value="🧹 Clear")
+            with gr.Column(scale=0.1, min_width=0):
                 regenerate_btn = gr.Button(value="🔄 Regenerate", visible=True)
+            with gr.Column(scale=0.1, min_width=0):
                 upload_btn = gr.UploadButton("📁 Upload image", file_types=["image"])
     # with gr.Group():
     #     with gr.Row():
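The hunk above wraps each widget in its own weighted gr.Column so the text box takes most of the row and the buttons share the remainder. Below is a stripped-down sketch of that layout pattern, not the Space's code: it uses integer scale values in a 6:1:1 ratio because recent Gradio releases document `scale` as an integer, while the diff's fractional values (0.6 and 0.1) express the same proportions in the Gradio version this Space was built against.

# Minimal sketch of a Row of weighted Columns (not the Space's code).
# Each Column's width is proportional to `scale`; `min_width` sets a pixel floor.
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=6):
            textbox = gr.Textbox(label="Text input")
        with gr.Column(scale=1, min_width=80):
            submit_btn = gr.Button(value="Submit")
        with gr.Column(scale=1, min_width=0):
            clear_btn = gr.Button(value="Clear")

if __name__ == "__main__":
    demo.launch()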
@@ -548,18 +552,60 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
         acc_text = ""

     def process_example(message, image):
-
-
-
-
-
-
-
-
-
-
+        """
+        Same as `model_inference` but in greedy mode and with the 80b-instruct.
+        Specifically for pre-computing the default examples.
+        """
+        model_selector="HuggingFaceM4/idefics-80b-instruct"
+        user_prompt_str=message
+        chat_history=[]
+        decoding_strategy="Greedy"
+        max_new_tokens=512
+
+        formated_prompt_list, user_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
+            current_user_prompt_str=user_prompt_str.strip(),
+            current_image=image,
+            history=chat_history,
+        )
+
+        client_endpoint = API_PATHS[model_selector]
+        client = Client(
+            base_url=client_endpoint,
+            headers={"x-use-cache": "0", "Authorization": f"Bearer {API_TOKEN}"},
         )
-
+
+        # Common parameters to all decoding strategies
+        # This documentation is useful to read: https://huggingface.co/docs/transformers/main/en/generation_strategies
+        generation_args = {
+            "max_new_tokens": max_new_tokens,
+            "repetition_penalty": None,
+            "stop_sequences": EOS_STRINGS,
+            "do_sample": False,
+        }
+
+        if image is None:
+            # Case where there is no image OR the image is passed as `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`
+            chat_history.append([prompt_list_to_markdown(user_prompt_list), ASSISTANT_PREPEND])
+        else:
+            # Case where the image is passed through the Image Box.
+            # Convert the image into base64 for both passing it through the chat history and
+            # displaying the image inside the same bubble as the text.
+            chat_history.append(
+                [
+                    f"{prompt_list_to_markdown([image] + user_prompt_list)}",
+                    ASSISTANT_PREPEND,
+                ]
+            )
+
+        query = prompt_list_to_tgi_input(formated_prompt_list)
+        generated_text = client.generate(prompt=query, **generation_args)
+        if generated_text.endswith("\nUser"):
+            generated_text = generated_text[:-5]
+
+        last_turn = chat_history.pop(-1)
+        last_turn[-1] += generated_text
+        chat_history.append(last_turn)
+        return "", None, chat_history

     textbox.submit(
         fn=model_inference,
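In isolation, the greedy text-generation-inference call that the new `process_example` performs looks roughly like the sketch below. The endpoint URL is a placeholder (the Space resolves real endpoints through `API_PATHS`), the stop sequences stand in for the Space's `EOS_STRINGS`, and the prompt text is made up; `Client.generate` returns a response object whose `generated_text` attribute holds the decoded string.

# Sketch of a greedy TGI call mirroring the generation_args above (placeholder endpoint).
import os

from text_generation import Client

TGI_ENDPOINT = "https://example.com/idefics-80b-instruct"  # placeholder, see API_PATHS
API_TOKEN = os.getenv("HF_AUTH_TOKEN")

client = Client(
    base_url=TGI_ENDPOINT,
    headers={"x-use-cache": "0", "Authorization": f"Bearer {API_TOKEN}"},
)
response = client.generate(
    prompt="User: Describe this image.<end_of_utterance>\nAssistant:",
    max_new_tokens=512,
    repetition_penalty=None,
    stop_sequences=["<end_of_utterance>", "\nUser:"],  # stand-in for EOS_STRINGS
    do_sample=False,  # greedy decoding, as in process_example
)
print(response.generated_text)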
@@ -789,7 +835,7 @@ with gr.Blocks(title="IDEFICS Playground", theme=gr.themes.Base()) as demo:
         inputs=[textbox, imagebox],
         outputs=[textbox, imagebox, chatbot],
         fn=process_example,
-        cache_examples=
+        cache_examples=True,
         examples_per_page=6,
         label=(
             "Click on any example below to get started.\nFor convenience, the model generations have been"