|
import os
from io import BytesIO
from urllib.request import urlopen

import streamlit as st
import textgrad as tg
from PIL import Image
from stqdm import stqdm
from streamlit_elements import elements, mui, editor, dashboard
from textgrad.autograd import MultimodalLLMCall
from textgrad.loss import ImageQALoss
|
|
|
class ImageQA:
    """Streamlit front-end for iteratively optimizing an image-QA answer
    with textgrad.

    Workflow: ``load_layout`` renders the image/question input widgets and
    builds the textgrad variables; each call to ``show_results`` performs one
    forward/backward/optimizer step (via ``_run``) and renders every
    iteration recorded so far in a tab.
    """

    def __init__(self, data) -> None:
        """Store demo data and set up the LLM engine and session state.

        Args:
            data: Mapping with the default demo inputs. Keys read here and
                in ``load_layout``: ``image_URL``, ``question_text``,
                ``evaluation_instruction``.
        """
        self.data = data
        # One multimodal engine, reused as textgrad's backward engine below.
        self.llm_engine = tg.get_engine("gpt-4o")
        print("="*50, "init", "="*50)

        # Latest loss text and gradients; populated by _run().
        self.loss_value = ""
        self.gradients = ""

        # Streamlit re-executes the script on every interaction, so the
        # iteration counter and accumulated results must live in
        # session_state to survive reruns.
        if 'iteration' not in st.session_state:
            st.session_state.iteration = 0
            st.session_state.results = []

        tg.set_backward_engine(self.llm_engine, override=True)

    def load_layout(self):
        """Render the input widgets and build the textgrad variables.

        Side effects: sets ``self.image_variable``, ``self.question_variable``,
        ``self.evaluation_instruction_text`` and ``self.loss_fn``; seeds
        ``st.session_state.current_response``.
        """
        st.markdown("**This is a solution optimization for image QA.**")
        col1, col2 = st.columns([1, 1])

        with col1:
            uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
            if uploaded_file is not None:
                image = Image.open(uploaded_file)
                st.image(image, caption="Uploaded Image")
            else:
                image_url = self.data["image_URL"]
                # PIL cannot open a remote URL directly; fetch it into
                # memory first. Local paths still go through Image.open.
                if image_url.startswith(("http://", "https://")):
                    with urlopen(image_url) as resp:
                        image = Image.open(BytesIO(resp.read()))
                else:
                    image = Image.open(image_url)
                st.image(image_url, caption="Default: MathVista image")

            # Re-encode as PNG bytes: the textgrad image variable carries
            # raw image bytes, not a PIL object.
            img_byte_arr = BytesIO()
            image.save(img_byte_arr, format='PNG')
            img_byte_arr = img_byte_arr.getvalue()
            self.image_variable = tg.Variable(
                img_byte_arr,
                role_description="image to answer a question about",
                requires_grad=False,
            )

        with col2:
            question_text = st.text_area("Question:", self.data["question_text"], height=150)
            self.question_variable = tg.Variable(question_text, role_description="question", requires_grad=False)
            self.evaluation_instruction_text = st.text_area("Evaluation instruction:", self.data["evaluation_instruction"], height=100)

        # Loss judges the model's response against the (user-editable)
        # evaluation instruction.
        self.loss_fn = ImageQALoss(
            evaluation_instruction=self.evaluation_instruction_text,
            engine="gpt-4o",
        )
        if "current_response" not in st.session_state:
            st.session_state.current_response = ""

    def _run(self):
        """Execute one textgrad optimization round.

        Forward pass -> loss -> backward (textual gradients) -> optimizer
        step. Caches ``self.loss_value``, ``self.gradients`` and the updated
        response in ``st.session_state.current_response``.
        """
        # Forward pass: answer the question about the image.
        self.response = MultimodalLLMCall("gpt-4o")([
            self.image_variable,
            self.question_variable,
        ])

        # The response text itself is the parameter being optimized.
        optimizer = tg.TGD(parameters=[self.response])

        loss = self.loss_fn(question=self.question_variable, image=self.image_variable, response=self.response)
        self.loss_value = loss.value

        # backward() attaches textual gradients to the response variable;
        # snapshot them before step() mutates the response.
        loss.backward()
        self.gradients = self.response.gradients

        optimizer.step()
        st.session_state.current_response = self.response.value

    def show_results(self):
        """Run one iteration, record it, and render all iterations in tabs."""
        self._run()
        st.session_state.iteration += 1
        st.session_state.results.append({
            'iteration': st.session_state.iteration,
            'loss_value': self.loss_value,
            'response': self.response.value,
            'gradients': self.gradients,
        })

        # One tab per completed iteration, newest last.
        tabs = st.tabs([f"Iteration {i+1}" for i in range(st.session_state.iteration)])

        for i, tab in enumerate(tabs):
            with tab:
                result = st.session_state.results[i]
                st.markdown(f"Current iteration: **{result['iteration']}**")
                st.markdown("## Current solution:")
                st.markdown(result['response'])

                col1, col2 = st.columns([1, 1])
                with col1:
                    st.markdown("## Loss value")
                    st.markdown(result['loss_value'])
                with col2:
                    st.markdown("## Code gradients")
                    # Number the gradients so multiple entries are
                    # distinguishable in the UI.
                    for j, g in enumerate(result['gradients']):
                        st.markdown(f"### Gradient {j+1}")
                        st.markdown(g.value)