File size: 4,589 Bytes
4f7c605
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
from transformers import TextIteratorStreamer, AutoModelForCausalLM, AutoProcessor
from threading import Thread
import re
import time
from PIL import Image
import torch
import argparse
import spaces

# Command-line interface: --model selects which Centurio checkpoint is served
# (the value is appended to "WueNLP/centurio_" to form the repository id).
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="aya", type=str)
args = parser.parse_args()

model_name = args.model

# The processor and the weights are loaded from the same repository id.
_checkpoint = f"WueNLP/centurio_{model_name}"

processor = AutoProcessor.from_pretrained(_checkpoint, trust_remote_code=True)

# Load in bfloat16 with reduced CPU memory usage, then move the model to the
# first CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    _checkpoint,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
model = model.to("cuda:0")

@spaces.GPU
def bot_streaming(message, history):
    """Stream the model's answer to one multimodal chat message.

    Args:
        message: Mapping with the user's prompt under ``"text"`` and the
            files attached to this turn under ``"files"``. The last entry of
            ``"files"`` is passed to ``Image.open``.
        history: Previous chat turns. A turn whose first element is a tuple
            is treated as an image upload, and the tuple's first item is
            used as the image.

    Yields:
        The answer generated so far, i.e. the text after the last assistant
        marker and before the end-of-turn marker.
    """
    # Start from "no image" so that a text-only conversation reaches the
    # image-less prompt below instead of failing on an unbound local.
    image = None
    if message["files"]:
        image = message["files"][-1]
    else:
        # No image uploaded for this turn: fall back to the most recent image
        # of the past turns (uploads are kept inside tuples).
        for turn in reversed(history):
            if isinstance(turn[0], tuple):
                image = turn[0][0]
                break

    image_slot = ""
    if image is not None:
        # convert() returns an independent, fully loaded image, so the
        # underlying file can be closed right away.
        with Image.open(image) as opened:
            image = opened.convert("RGB")
        image_slot = "<image_placeholder>\n"

    # The chat template and the markers used to cut the answer out of the
    # streamed text depend on the backbone selected via `model_name`.
    text = message["text"]
    if "qwen" in model_name:
        prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_slot}{text}<|im_end|>\n<|im_start|>assistant\n"
        answer_start, answer_end = "<|im_start|>assistant\n", "<|im_end|>"
    else:
        prompt = f"<BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>{image_slot}{text}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        answer_start, answer_end = "<|CHATBOT_TOKEN|>", "<|END_OF_TURN_TOKEN|>"

    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0", torch.bfloat16)

    # Special tokens are kept in the stream because the markers above are
    # needed to locate the answer.
    streamer = TextIteratorStreamer(processor, skip_special_tokens=False)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        do_sample=True,
        num_beams=1,
        repetition_penalty=1.15,
        temperature=0.7,
        top_p=0.8,
        top_k=20,
        max_new_tokens=512,
        min_new_tokens=1,
    )

    # Generation runs in a background thread while this generator consumes
    # the streamer and forwards partial answers to the UI.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        generated_text_without_prompt = buffer.split(answer_start)[-1].split(answer_end)[0]

        time.sleep(0.04)
        yield generated_text_without_prompt


# Markdown shown above the chat box. The title link carries an explicit
# https:// scheme; without one, Markdown resolves the target as a path
# relative to the page the demo is served from.
description = ("""# [Centurio: On Drivers of Multilingual Ability of Large Vision-Language Model](https://gregor-ge.github.io/Centurio/)

Try [Centurio](https://huggingface.co/collections/WueNLP/centurio-677cf0ab6ddea874927a154e), a massively multilingual large vision-language model, in this demo (specifically, [Centurio Aya](https://huggingface.co/WueNLP/centurio_aya)). 

Upload an image and start chatting about it, or try one of the examples below. 



Centurio is trained with 100 languages but quality of answers can differ greatly depending on your language. 

Centurio is trained to read text in images but struggles with small text and with non-Latin scripts.



> If you don't upload an image, you will receive an error.

> This demo does not support multi-image prompts or multi-turn dialog. Every new prompt will refer to the last image (if no new image is included) without prior dialog as context.""")

# Example prompts offered below the chat box; each pairs a question with an
# image file referenced by a path relative to the working directory.
_example_prompts = [
    {"text": "What is on the flower?", "files": ["./bee.jpg"]},
    {"text": "How to make this pastry?", "files": ["./baklava.png"]},
]

# Multimodal chat UI that forwards every message to bot_streaming.
demo = gr.ChatInterface(
    fn=bot_streaming,
    multimodal=True,
    title="Centurio Demo",
    description=description,
    examples=_example_prompts,
    stop_btn="Stop Generation",
)
demo.launch(share=True, debug=True)