# Eden-Multimodal / app.py
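# Gradio demo app for the fine-tuned multimodal "Eden" model: image features
# (CLIP) and speech transcripts (Whisper) are folded into a Phi-3.5-mini-instruct
# prompt, and the answer is streamed back through a chat UI.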
import gradio as gr
import torch
from models.multimodel_phi import MultimodalPhiModel
from utils.audio_processing import transcribe_speech
from utils.text_processing import getStringAfter, getAnswerPart, getInputs
from config import device, model_location, base_phi_model, tokenizer
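# The helpers above are project-local; their roles, inferred from how they are
# used below: transcribe_speech runs speech-to-text on an audio file, getInputs
# builds the token/image-feature inputs for the multimodal model, and
# getAnswerPart / getStringAfter extract the answer span from decoded text.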
model = MultimodalPhiModel.from_pretrained(model_location).to(device)
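
# Inference-only usage (an assumption: MultimodalPhiModel follows the standard
# torch.nn.Module API): eval() disables dropout, and the inference_mode()
# decorator below skips autograd bookkeeping for the whole answer pipeline.
model.eval()

@torch.inference_mode()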
def output_parser(image_path, audio_path, context_text, question, max_iterations=3):
    answerPart = ""
    speech_text = ""

    if image_path is not None:
        # Refine the image answer over a few passes: each pass feeds the
        # previous partial answer back into the model inputs via getInputs.
        for _ in range(max_iterations):
            start_tokens, end_tokens, image_features, attention_mask = getInputs(
                image_path, question, answer=answerPart
            )
            output = model(start_tokens, end_tokens, image_features, attention_mask, labels=None)
            tokens = output.logits.argmax(dim=-1)  # greedy decode of the logits
            decoded = tokenizer.decode(tokens[0], skip_special_tokens=True)
            answerPart = getAnswerPart(decoded)
            print("Answer part:", answerPart)

    if audio_path is not None:
        speech_text = transcribe_speech(audio_path)
        print("Speech text:", speech_text)

    if not question:
        question = "Describe the objects and their relationships in 1 sentence."
    # Assemble a Phi-3.5 chat-format prompt that merges the image answer,
    # the audio transcript, and any extra context text.
    input_text = (
        "<|system|>\n Please understand the context "
        "and answer the question in 1 or 2 summarized sentences.\n"
        f"<|end|>\n<|user|>\n<|context|> {answerPart} \n {speech_text} \n {context_text} "
        f"\n<|question|>: {question} \n<|end|>\n<|assistant|>\n"
    )
    print("input_text:", input_text)

    tokens = tokenizer(input_text, padding=True, truncation=True, max_length=1024, return_tensors="pt")
    start_tokens = tokens["input_ids"].to(device)
    attention_mask = tokens["attention_mask"].to(device)

    generated = base_phi_model.generate(
        start_tokens,
        attention_mask=attention_mask,
        max_length=1024,
        do_sample=False,  # deterministic, greedy decoding
        pad_token_id=tokenizer.pad_token_id,
    )
    output_text = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Keep only the text after the question, dropping the echoed prompt.
    output_text = getStringAfter(output_text, question).strip()
    return output_text
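
# Example of invoking the pipeline directly (hypothetical file paths):
#   output_parser("examples/dog.jpg", None, "", "What is the dog doing?")
#   output_parser(None, "examples/clip.wav", "", "Summarize the audio.")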
# Gradio interface setup
title = "Fine-Tuned Multimodal Model"
description = "Test the fine-tuned multimodal model built from the CLIP, Phi-3.5-mini-instruct, and Whisper models."
def process_chat_input(history, message, audio):
    # Route attached files by extension; fall back to the microphone/upload
    # component when no audio file came through the textbox.
    image_path = next(
        (f for f in message["files"] if f.split(".")[-1].lower() in ("jpg", "jpeg", "png", "gif")),
        None,
    )
    audio_path = next(
        (f for f in message["files"] if f.split(".")[-1].lower() in ("mp3", "wav", "ogg")),
        None,
    ) or audio
    question = message["text"]
    response = output_parser(image_path, audio_path, "", question)

    # Echo the user's inputs into the chat history before streaming the reply.
    if image_path:
        history.append({"role": "user", "content": {"path": image_path}})
    if audio_path:
        history.append({"role": "user", "content": {"path": audio_path}})
    if question:
        history.append({"role": "user", "content": question})

    # Yield once up front so the user turn renders even for an empty response.
    history.append({"role": "assistant", "content": ""})
    yield history, ""
    for char in response:
        history[-1]["content"] += char
        yield history, ""
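# Because process_chat_input is a generator, Gradio re-renders the chatbot on
# every yield, which produces the typing effect; the empty-string second output
# keeps the multimodal textbox cleared while streaming.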
# Monochrome dark theme: hex values and *neutral_* palette tokens override
# the Base theme defaults.
custom_theme = gr.themes.Base(
    primary_hue="gray",
    secondary_hue="gray",
    neutral_hue="gray",
    font=["Helvetica", "ui-sans-serif", "system-ui", "sans-serif"],
).set(
    body_background_fill="#000000",
    body_text_color="#ffffff",
    color_accent_soft="*neutral_600",
    background_fill_primary="#111111",
    background_fill_secondary="#222222",
    border_color_accent="*neutral_700",
    button_primary_background_fill="*neutral_800",
    button_primary_text_color="#ffffff",
    # Keep block titles and labels white as well.
    block_title_text_color="#ffffff",
    block_label_text_color="#ffffff",
)
with gr.Blocks(theme=custom_theme) as demo:
    with gr.Row():
        gr.Markdown("# Eden")
        gr.Markdown("Chat with the fine-tuned multimodal Eden using text, audio, or image inputs.")

    chatbot = gr.Chatbot(
        elem_id="chatbot",
        bubble_full_width=False,
        height=450,
        type="messages",  # expects openai-style {"role": ..., "content": ...} dicts
    )
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_count="multiple",
        placeholder="Enter your message, upload an image, or upload an audio file...",
        show_label=False,
        file_types=["image", "audio"],
        container=False,
        scale=3,
        lines=1,
    )

    gr.Markdown("Or record a message:")
    audio_input = gr.Audio(type="filepath", sources=["microphone", "upload"])
    # On submit, stream the response into the chatbot, then re-enable the
    # textbox once generation finishes.
    chat_input.submit(
        process_chat_input,
        [chatbot, chat_input, audio_input],
        [chatbot, chat_input],
    ).then(lambda: gr.MultimodalTextbox(interactive=True, lines=1), None, [chat_input])
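    # Example prompts; selecting one populates the textbox's text field.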
    gr.Examples(
        examples=[
            "Describe the objects in the image.",
            "What can you hear in the audio?",
            "Summarize the context provided.",
        ],
        inputs=chat_input,
    )

demo.launch(debug=True)