import gradio as gr
import torch

from models.multimodel_phi import MultimodalPhiModel
from utils.audio_processing import transcribe_speech
from utils.image_processing import getImageArray
from utils.text_processing import getStringAfter, getAnswerPart, getInputs
from config import device, model_location, base_phi_model, tokenizer

# Load the fine-tuned multimodal model.
model = MultimodalPhiModel.from_pretrained(model_location).to(device)


def output_parser(image_path, audio_path, context_text, question, max_length=3):
    """Gather context from the image and audio inputs, then answer the question with the base Phi model."""
    answerPart = ""
    speech_text = ""

    if image_path is not None:
        # Iteratively refine the image-grounded answer; max_length is the number of refinement passes.
        for i in range(max_length):
            start_tokens, end_tokens, image_features, attention_mask = getInputs(
                image_path, question, answer=answerPart
            )
            output = model(start_tokens, end_tokens, image_features, attention_mask, labels=None)
            tokens = output.logits.argmax(dim=-1)
            output = tokenizer.decode(tokens[0], skip_special_tokens=True)
            answerPart = getAnswerPart(output)
        print("Answerpart:", answerPart)

    if audio_path is not None:
        speech_text = transcribe_speech(audio_path)
        print("Speech Text:", speech_text)

    if (question is None) or (question == ""):
        question = " Describe the objects and their relationships in 1 sentence."

    input_text = (
        "<|system|>\n Please understand the context "
        "and answer the question in 1 or 2 summarized sentences.\n"
        f"<|end|>\n<|user|>\n<|context|> {answerPart} \n {speech_text} \n {context_text} "
        f"\n<|question|>: {question} \n<|end|>\n<|assistant|>\n"
    )
    print("input_text:", input_text)

    tokens = tokenizer(input_text, padding=True, truncation=True, max_length=1024, return_tensors="pt")
    start_tokens = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    output_text = tokenizer.decode(
        base_phi_model.generate(
            start_tokens,
            attention_mask=attention_mask,
            max_length=1024,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )[0],
        skip_special_tokens=True,
    )
    # Keep only the text generated after the question, i.e. the assistant's answer.
    output_text = getStringAfter(output_text, question).strip()
    return output_text


# Gradio interface setup
title = "Fine-Tuned Multimodal Model"
description = "Test the fine-tuned multimodal model created using the CLIP, Phi-3.5 Mini Instruct, and Whisper models."


def process_chat_input(history, message, audio):
    """Route uploaded files and text to output_parser and stream the reply into the chat history."""
    image_path = next(
        (file for file in message["files"] if file.split('.')[-1].lower() in ['jpg', 'jpeg', 'png', 'gif']),
        None,
    )
    audio_path = next(
        (file for file in message["files"] if file.split('.')[-1].lower() in ['mp3', 'wav', 'ogg']),
        None,
    ) or audio
    question = message["text"]

    response = output_parser(image_path, audio_path, "", question)

    if image_path:
        history.append({"role": "user", "content": {"path": image_path}})
    if audio_path:
        history.append({"role": "user", "content": {"path": audio_path}})
    if question:
        history.append({"role": "user", "content": question})

    # Stream the answer into the chat window character by character, clearing the textbox.
    history.append({"role": "assistant", "content": ""})
    for char in response:
        history[-1]["content"] += char
        yield history, ""


custom_theme = gr.themes.Base(
    primary_hue="gray",
    secondary_hue="gray",
    neutral_hue="gray",
    font=["Helvetica", "ui-sans-serif", "system-ui", "sans-serif"],
).set(
    body_background_fill="#000000",
    body_text_color="#ffffff",
    color_accent_soft="*neutral_600",
    background_fill_primary="#111111",
    background_fill_secondary="#222222",
    border_color_accent="*neutral_700",
    button_primary_background_fill="*neutral_800",
    button_primary_text_color="#ffffff",
    # Ensure block titles and labels are white as well.
    block_title_text_color="#ffffff",
    block_label_text_color="#ffffff",
)
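
# Optional manual smoke test: a minimal sketch for exercising output_parser without
# launching the Gradio UI. The sample paths below are hypothetical placeholders (not
# files shipped with this repo); point them at real assets and call _smoke_test() by hand.
def _smoke_test():
    sample_image = "samples/example.jpg"  # hypothetical image path
    sample_audio = "samples/example.wav"  # hypothetical audio path
    print(output_parser(sample_image, None, "", "What objects are in the image?"))
    print(output_parser(None, sample_audio, "", "Summarize what is said in the audio."))
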
with gr.Blocks(theme=custom_theme) as demo:
    with gr.Row():
        gr.Markdown("# Eden")
        gr.Markdown("Chat with the fine-tuned multimodal Eden using text, audio, or image inputs.")

    chatbot = gr.Chatbot(
        elem_id="chatbot",
        bubble_full_width=False,
        height=450,
        type="messages",
    )
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_count="multiple",
        placeholder="Enter your message, upload an image, or upload an audio file...",
        show_label=False,
        file_types=["image", "audio"],
        container=False,
        scale=3,
        lines=1,
    )
    gr.Markdown("Or record a message:")
    audio_input = gr.Audio(type="filepath", sources=["microphone", "upload"])

    # Submit routes the chat history, multimodal message, and recorded audio to the model,
    # then re-enables the textbox once the streamed response has finished.
    chat_input.submit(
        process_chat_input,
        [chatbot, chat_input, audio_input],
        [chatbot, chat_input],
    ).then(lambda: gr.MultimodalTextbox(interactive=True, lines=1), None, [chat_input])

    gr.Examples(
        examples=[
            {"text": "Describe the objects in the image.", "files": []},
            {"text": "What can you hear in the audio?", "files": []},
            {"text": "Summarize the context provided.", "files": []},
        ],
        inputs=chat_input,
    )

demo.launch(debug=True)