"""Gradio front-end for Judge Arena.

Two judge models score the same (user input, AI response) sample, the user
votes for the better judgement, and votes are persisted and aggregated into
a leaderboard.
"""

import json
import re
import random
from collections import defaultdict
from datetime import datetime
import hashlib

import gradio as gr
from dotenv import load_dotenv

# Called before the project imports below; presumably so those modules can
# read environment variables at import time -- keep this ordering.
load_dotenv()

from gen_api_answer import (
    get_model_response,
    parse_model_response,
    prometheus_parse_model_response,
    atla_parse_model_response,
    flow_judge_parse_model_response
)

from random_sample_generation import (
    get_random_human_ai_pair,
    get_random_human_ai_ground_truth_pair,
    generate_ai_response
)
from db import add_vote, create_db_connection, get_votes

from utils import Vote

from common import (
    POLICY_CONTENT,
    ACKNOWLEDGEMENTS,
    CSS_STYLES,
    MAIN_TITLE,
    HOW_IT_WORKS,
)
from prompts import (
    DEFAULT_EVAL_PROMPT,
    DEFAULT_EVAL_PROMPT_EDITABLE,
    FIXED_EVAL_SUFFIX,
    DEFAULT_EVAL_CRITERIA,
    DEFAULT_SCORE_1,
    DEFAULT_SCORE_2,
    DEFAULT_SCORE_3,
    DEFAULT_SCORE_4,
    DEFAULT_SCORE_5,
)
from leaderboard import (
    get_leaderboard,
    get_leaderboard_stats,
    get_model_rankings,
    DEFAULT_ELO,
    K_FACTOR
)


# In-memory, per-process ELO bookkeeping updated by `vote`. The positions
# shown to users are computed separately by `get_leaderboard` from the votes
# returned by `get_votes(db)`.
elo_scores = defaultdict(lambda: DEFAULT_ELO)
vote_counts = defaultdict(int)

db = create_db_connection()
votes_collection = get_votes(db)

current_time = datetime.now()

# Judges that get extra exposure when pairing models for a match.
_NEW_MODELS = ["Atla-8B-preview", "Flow-Judge-0.1", "SFR-LLaMA-3.1-70B-Judge"]  # add "Flow-Judge-1.0" once ready
# Judge forced into the first match of every session to catch up on votes.
_FIRST_GAME_MODEL = "SFR-LLaMA-3.1-70B-Judge"
# Probability that a non-first match features one of `_NEW_MODELS`.
_NEW_MODEL_PROBABILITY = 0.4


def load_model_data():
    """Load judge model metadata from ``data/models.jsonl``.

    Each non-blank line must be a JSON object with the keys ``name``,
    ``organization``, ``license`` and ``api_model``; an optional ``active``
    flag defaults to ``True``.

    Returns:
        A dict mapping model name to its metadata dict, or an empty dict when
        the file does not exist.
    """
    model_data = {}
    try:
        with open("data/models.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing lines in the JSONL file.
                    continue
                model = json.loads(line)
                model_data[model["name"]] = {
                    "organization": model["organization"],
                    "license": model["license"],
                    "api_model": model["api_model"],
                    # Carried through so that match selection can honour it;
                    # it is read there with `info.get("active", True)`.
                    "active": model.get("active", True),
                }
    except FileNotFoundError:
        print("Warning: models.jsonl not found")
        return {}
    return model_data


model_data = load_model_data()


def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    """Build a `Vote` record stamped with the current time and persist it.

    `prompt` may be a plain string or an object exposing a ``value``
    attribute (e.g. a Gradio component); in the latter case ``prompt.value``
    is stored.
    """
    prompt_value = prompt.value if hasattr(prompt, 'value') else prompt

    vote = Vote(
        timestamp=datetime.now().isoformat(),
        prompt=prompt_value,
        response_a=response_a,
        response_b=response_b,
        model_a=model_a,
        model_b=model_b,
        winner=winner,
        judge_id=judge_id,
    )
    add_vote(vote, db)


def parse_variables(prompt):
    """Return the distinct ``{{variable}}`` names in `prompt`.

    Names are whitespace-stripped and returned in order of first appearance.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(name.strip() for name in re.findall(r"{{(.*?)}}", prompt)))


def get_final_prompt(eval_prompt, variable_values):
    """Substitute ``{{name}}`` placeholders in `eval_prompt`.

    Args:
        eval_prompt: Template text.
        variable_values: Mapping of placeholder name to replacement string.

    Returns:
        The template with every known placeholder replaced. Whitespace just
        inside the braces is ignored so that ``{{ name }}`` -- which
        `parse_variables` reports as ``name`` -- is substituted as well.
    """
    for var, val in variable_values.items():
        pattern = r"{{\s*" + re.escape(var) + r"\s*}}"
        # A callable replacement keeps backslashes in `val` literal.
        eval_prompt = re.sub(pattern, lambda _match, _val=val: _val, eval_prompt)
    return eval_prompt


def get_ip(request: gr.Request) -> str:
    """Get and hash the IP address from the request.

    Header precedence is ``cf-connecting-ip``, then the first entry of
    ``x-forwarded-for``, then the socket peer address. Returns the first 16
    hex characters of the SHA-256 digest of that address.
    """
    if "cf-connecting-ip" in request.headers:
        ip = request.headers["cf-connecting-ip"]
    elif "x-forwarded-for" in request.headers:
        # The header may hold a comma-separated chain; only the first entry
        # is used, stripped so stray whitespace does not change the hash.
        ip = request.headers["x-forwarded-for"].split(",")[0].strip()
    else:
        ip = request.client.host

    # Hash the IP address for privacy
    return hashlib.sha256(ip.encode()).hexdigest()[:16]


def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
    """Generate appropriate message based on vote and model rankings.

    Returns (title, message) tuple.
    """
    message = "Keep voting responsibly 🤗"
    if choice == "Tie":
        return "It's a tie!", message

    # Get current rankings
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    # NOTE(review): a model missing from `rankings` gets position 0 and so
    # compares as better-ranked than any positive position -- confirm that
    # every model is always ranked when show_preliminary=True.
    pos_a = rankings.get(model_a, 0)
    pos_b = rankings.get(model_b, 0)

    # A smaller position number is treated as the better-ranked model.
    favourite_won = (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a)
    title = "The favourite wins!" if favourite_won else "The underdog wins!"
    return title, message


def vote(
    choice,
    model_a,
    model_b,
    final_prompt,
    score_a,
    critique_a,
    score_b,
    critique_b,
    request: gr.Request,
):
    """Record a vote and reveal which models produced the two judgements.

    Args:
        choice: "A", "B", or anything else (treated as a tie for scoring).
        model_a, model_b: Names of the two judge models.
        final_prompt: Prompt to store with the vote.
        score_a, critique_a, score_b, critique_b: The judges' displayed output.
        request: Gradio request, used to derive the hashed voter id.

    Returns:
        A list of eight values for, in order: vote_a, vote_b, vote_tie,
        model_name_a, model_name_b, send_btn, random_btn and a placeholder
        slot that receives the return value of `gr.Info`.
    """
    # Get hashed IP as judge_id
    judge_id = get_ip(request)

    # Update ELO scores based on user choice
    elo_a = elo_scores[model_a]
    elo_b = elo_scores[model_b]

    # Calculate expected scores
    Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
    Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))

    # Assign actual scores
    if choice == "A":
        Sa, Sb = 1, 0
    elif choice == "B":
        Sa, Sb = 0, 1
    else:
        Sa, Sb = 0.5, 0.5

    # Update scores and vote counts
    elo_scores[model_a] += K_FACTOR * (Sa - Ea)
    elo_scores[model_b] += K_FACTOR * (Sb - Eb)
    vote_counts[model_a] += 1
    vote_counts[model_b] += 1

    # Format the full responses with score and critique
    response_a = f"{score_a} {critique_a}"
    response_b = f"{score_b} {critique_b}"

    # Store the vote data with the final prompt
    store_vote_data(
        final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
    )

    # Get model positions for display (computed after the vote was stored)
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    pos_a = rankings.get(model_a, 0)
    pos_b = rankings.get(model_b, 0)

    # Format model names with positions and win/loss indicators
    if choice == "Tie":
        model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
        model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
    else:
        model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
        model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"

    # Generate vote message
    title, message = get_vote_message(choice, model_a, model_b)

    return [
        gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"),  # vote_a
        gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"),  # vote_b
        gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"),  # vote_tie
        gr.update(value=model_a_display),  # model_name_a
        gr.update(value=model_b_display),  # model_name_b
        gr.update(interactive=True, value="Regenerate judges", variant="secondary"),  # send_btn
        gr.update(value="🎲 New round", variant="primary"),  # random_btn
        gr.Info(message, title=title),  # success message
    ]


def get_current_votes():
    """Get current votes from database."""
    return get_votes(db)


def refresh_leaderboard(show_preliminary):
    """Refresh the leaderboard data and stats.

    Returns a two-item list of updates: the table rows (model, ELO, 95% CI,
    vote count, organization, license) and the stats value.
    """
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary)
    data = [
        [
            entry["Model"],
            float(entry["ELO Score"]),
            entry["95% CI"],
            entry["# Votes"],
            entry["Organization"],
            entry["License"],
        ]
        for entry in leaderboard
    ]
    stats = get_leaderboard_stats(model_data, voting_data)
    return [gr.update(value=data), gr.update(value=stats)]


def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs.

    When `compatible_mode` is true a ground-truth response is fetched as well
    and made visible; otherwise the ground-truth box is cleared and hidden.
    `request` is not used by the body.

    Returns:
        A list of thirteen updates for, in order: human_input, ai_response,
        random_btn, score_a, critique_a, score_b, critique_b, vote_a, vote_b,
        vote_tie, model_name_a, model_name_b, ground_truth.
    """
    if compatible_mode:
        # Generate all three components when compatible mode is enabled
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        # Generate only human and AI messages when compatible mode is disabled
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    return [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),  # Reset random button appearance
        gr.update(value=""),  # Clear score A
        gr.update(value=""),  # Clear critique A
        gr.update(value=""),  # Clear score B
        gr.update(value=""),  # Clear critique B
        gr.update(interactive=False, variant="primary"),  # Reset vote A
        gr.update(interactive=False, variant="primary"),  # Reset vote B
        gr.update(interactive=False, variant="primary"),  # Reset vote tie
        gr.update(value="*Model: Hidden*"),  # Reset model name A
        gr.update(value="*Model: Hidden*"),  # Reset model name B
        gr.update(value=ground_truth_msg, visible=compatible_mode),  # Set ground truth and visibility
    ]


def _select_judge_pair(is_first_game):
    """Pick two distinct active judge models, returned in random A/B order.

    The first game of a session features `_FIRST_GAME_MODEL` when it is
    available. Later games feature one of `_NEW_MODELS` against an
    established model with probability `_NEW_MODEL_PROBABILITY`, and two
    established models otherwise. Models that are inactive or absent from
    `model_data` are never selected.
    """
    active_models = [name for name, info in model_data.items() if info.get("active", True)]
    if len(active_models) < 2:
        raise gr.Error("At least two active judge models are required to run a match.")

    featured = [m for m in _NEW_MODELS if m in active_models]
    established = [m for m in active_models if m not in _NEW_MODELS]

    if is_first_game and _FIRST_GAME_MODEL in active_models:
        opponents = [m for m in active_models if m != _FIRST_GAME_MODEL]
        pair = [_FIRST_GAME_MODEL, random.choice(opponents)]
    elif featured and established and (
        len(established) < 2 or random.random() < _NEW_MODEL_PROBABILITY
    ):
        pair = [random.choice(featured), random.choice(established)]
    elif len(established) >= 2:
        pair = random.sample(established, 2)
    else:
        # Too few established models: fall back to any two active ones.
        pair = random.sample(active_models, 2)

    # Randomise which judge is shown as A and which as B.
    random.shuffle(pair)
    return pair[0], pair[1]


def _parse_judge_response(model_name, response):
    """Parse a raw judge response using the parser for the model's organization.

    Returns a ``(score_text, critique)`` tuple where ``score_text`` is the
    parsed score followed by ``" / 5"``.
    """
    organization = (model_data.get(model_name) or {}).get("organization")
    if organization == "Prometheus":
        parser = prometheus_parse_model_response
    elif organization in ("Atla", "Salesforce"):  # Same parser for Atla and Salesforce
        parser = atla_parse_model_response
    elif organization == "Flow AI":
        parser = flow_judge_parse_model_response
    else:
        parser = parse_model_response
    score, critique = parser(response)
    return f"{score} / 5", critique


# NOTE(review): the container nesting below was reconstructed from a
# whitespace-collapsed source -- confirm the layout renders as intended.
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)

    # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
    eval_prompt = gr.Textbox(
        value=DEFAULT_EVAL_PROMPT,
        visible=False
    )

    with gr.Tabs():
        with gr.TabItem("Judge Arena"):
            with gr.Row():
                # Left side - Input section
                with gr.Column(scale=1):
                    with gr.Group():
                        human_input = gr.TextArea(
                            label="👩 User Input",
                            lines=10,
                            placeholder="Enter the human message here..."
                        )
                        with gr.Row():
                            generate_btn = gr.Button(
                                "Generate AI Response",
                                size="sm",
                                interactive=False
                            )

                        ai_response = gr.TextArea(
                            label="🤖 AI Response",
                            lines=15,
                            placeholder="Enter the AI response here..."
                        )

                        # Ground truth response (initially hidden)
                        ground_truth = gr.TextArea(
                            label="🎯 Ground truth response",
                            lines=12,
                            placeholder="Enter the ground truth response here...",
                            visible=False
                        )

                    with gr.Row():
                        random_btn = gr.Button("🎲", scale=2)
                        send_btn = gr.Button(
                            value="Run judges",
                            variant="primary",
                            size="lg",
                            scale=8
                        )

                # Right side - Model outputs
                with gr.Column(scale=1):
                    gr.Markdown("### 👩‍⚖️ Judge A")
                    with gr.Group():
                        model_name_a = gr.Markdown("*Model: Hidden*")
                        with gr.Row():
                            with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
                                score_a = gr.Textbox(label="Score", lines=6, interactive=False)
                                vote_a = gr.Button("Vote A", variant="primary", interactive=False)
                            with gr.Column(scale=9, min_width=400):  # Wider width for critique
                                critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)

                    # Tie button row
                    with gr.Row() as tie_button_row:
                        with gr.Column():
                            vote_tie = gr.Button("Tie", variant="primary", interactive=False)

                    gr.Markdown("### 🧑‍⚖️ Judge B")
                    with gr.Group():
                        model_name_b = gr.Markdown("*Model: Hidden*")
                        with gr.Row():
                            with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
                                score_b = gr.Textbox(label="Score", lines=6, interactive=False)
                                vote_b = gr.Button("Vote B", variant="primary", interactive=False)
                            with gr.Column(scale=9, min_width=400):  # Wider width for critique
                                critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)

            # Spacer between the arena and the prompt editor
            gr.Markdown("\n")

            # Judge prompt editor
            with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
                gr.Markdown("\n")
                use_reference_toggle = gr.Checkbox(
                    label="Use a reference response",
                    value=False
                )

                # Hide the default prompt editor
                with gr.Column(visible=False) as default_prompt_editor:
                    eval_prompt_editable = gr.TextArea(
                        value=DEFAULT_EVAL_PROMPT_EDITABLE,
                        label="Evaluation Criteria",
                        lines=12
                    )

                    with gr.Row(visible=False) as edit_buttons_row:
                        cancel_prompt_btn = gr.Button("Cancel")
                        save_prompt_btn = gr.Button("Save", variant="primary")
                    gr.Markdown("*The sample being evaluated is always appended as:*")
                    gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")

                # Show the compatible mode editor
                with gr.Column(visible=True) as compatible_prompt_editor:
                    with gr.Row():
                        # Left column - Evaluation Criteria
                        with gr.Column(scale=1):
                            eval_criteria_text = gr.TextArea(
                                label="Evaluation Criteria",
                                lines=12,
                                value=DEFAULT_EVAL_CRITERIA,
                                placeholder="Enter the evaluation criteria..."
                            )
                            prometheus_reference = gr.Markdown(
                                "\n*By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
                                visible=True
                            )

                        # Right column - Score Descriptions
                        with gr.Column(scale=1):
                            score1_description = gr.TextArea(
                                label="Score 1",
                                value=DEFAULT_SCORE_1,
                                placeholder="Description for score 1",
                                lines=2
                            )
                            score2_description = gr.TextArea(
                                label="Score 2",
                                value=DEFAULT_SCORE_2,
                                placeholder="Description for score 2",
                                lines=2
                            )
                            score3_description = gr.TextArea(
                                label="Score 3",
                                value=DEFAULT_SCORE_3,
                                placeholder="Description for score 3",
                                lines=2
                            )
                            score4_description = gr.TextArea(
                                label="Score 4",
                                value=DEFAULT_SCORE_4,
                                placeholder="Description for score 4",
                                lines=2
                            )
                            score5_description = gr.TextArea(
                                label="Score 5",
                                value=DEFAULT_SCORE_5,
                                placeholder="Description for score 5",
                                lines=2
                            )

                    # Add save/cancel buttons for compatible mode
                    with gr.Row(visible=False) as compatible_edit_buttons_row:
                        compatible_cancel_btn = gr.Button("Cancel")
                        compatible_save_btn = gr.Button("Save", variant="primary")

        with gr.TabItem("Leaderboard"):
            with gr.Row():
                with gr.Column(scale=1):
                    show_preliminary = gr.Checkbox(
                        label="Reveal preliminary results",
                        value=True,  # Checked by default
                        info="Show all models, including models with less human ratings (< 300 votes)",
                        interactive=True
                    )
            stats_display = gr.Markdown()
            leaderboard_table = gr.Dataframe(
                headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
                # One datatype per header, matching the rows built by
                # refresh_leaderboard.
                datatype=["str", "number", "str", "number", "str", "str"],
            )

            gr.Markdown("""

Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched: [*"Together Turbo achieves this performance while maintaining full accuracy compared to Meta's reference implementation across all models. Llama-3.1-405B-Instruct-Turbo matches the accuracy of Meta reference models."*](https://www.together.ai/blog/together-inference-engine-2)
""")

            # Add change handler for checkbox
            show_preliminary.change(
                fn=refresh_leaderboard,
                inputs=[show_preliminary],
                outputs=[leaderboard_table, stats_display]
            )

            # Populate the leaderboard when the page loads
            demo.load(
                fn=refresh_leaderboard,
                inputs=[show_preliminary],
                outputs=[leaderboard_table, stats_display]
            )

        with gr.TabItem("Policy"):
            gr.Markdown(POLICY_CONTENT)
            gr.Markdown(ACKNOWLEDGEMENTS)

    # Define state variables for model tracking
    model_a_state = gr.State()
    model_b_state = gr.State()
    final_prompt_state = gr.State()
    eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)  # Initialize with default value
    is_editing = gr.State(False)  # Track editing state
    compatible_mode_state = gr.State(False)  # Track compatible mode state

    # Not referenced by any handler in this module.
    def update_model_names(model_a, model_b):
        """Return Markdown updates showing the two model names."""
        return gr.update(value=f"*Model: {model_a}*"), gr.update(
            value=f"*Model: {model_b}*"
        )

    # Store the last submitted prompt and variables for comparison
    last_submission = gr.State({})

    # The three vote buttons share one handler; only the fixed `choice`
    # value passed as the first input differs.
    for button, choice in ((vote_a, "A"), (vote_b, "B"), (vote_tie, "Tie")):
        button.click(
            fn=vote,
            inputs=[
                gr.State(choice),
                model_a_state,
                model_b_state,
                final_prompt_state,
                score_a,
                critique_a,
                score_b,
                critique_b,
            ],
            outputs=[
                vote_a,
                vote_b,
                vote_tie,
                model_name_a,
                model_name_b,
                send_btn,
                random_btn,
                gr.State(),  # placeholder for success message
            ],
        )

    # Add handlers for save/cancel buttons
    def save_prompt(new_prompt, previous_prompt):
        """Accept the edited prompt as the new baseline and hide the buttons."""
        return [
            gr.update(value=new_prompt),  # Update the prompt
            new_prompt,  # Update the previous prompt state
            gr.update(visible=False)  # Hide the buttons
        ]

    def cancel_prompt(previous_prompt):
        """Revert the editor to the last saved prompt and hide the buttons."""
        return [
            gr.update(value=previous_prompt),  # Revert to previous prompt
            previous_prompt,  # Keep the previous prompt state
            gr.update(visible=False)  # Hide the buttons
        ]

    def show_edit_buttons(current_value, previous_value):
        """Show the save/cancel row only while the editor differs from the saved value."""
        return gr.update(visible=current_value != previous_value)

    # Add handlers for save/cancel buttons and prompt changes
    save_prompt_btn.click(
        fn=save_prompt,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    cancel_prompt_btn.click(
        fn=cancel_prompt,
        inputs=[eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    eval_prompt_editable.change(
        fn=show_edit_buttons,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=edit_buttons_row
    )

    def toggle_use_reference(checked):
        """Switch reference mode on or off.

        Enabling it loads a fresh random sample with a ground truth and
        resets the judge outputs; disabling it only hides the ground-truth
        box. Returns a dict keyed by component, so the disabled case can
        update a single output.
        """
        if checked:
            # Get new random samples with ground truth when enabling reference mode
            human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()

            return {
                ground_truth: gr.update(visible=True, value=ground_truth_msg),
                human_input: gr.update(value=human_msg),
                ai_response: gr.update(value=ai_msg),
                # Reset other UI elements
                score_a: gr.update(value=""),
                critique_a: gr.update(value=""),
                score_b: gr.update(value=""),
                critique_b: gr.update(value=""),
                vote_a: gr.update(interactive=False, variant="primary"),
                vote_b: gr.update(interactive=False, variant="primary"),
                vote_tie: gr.update(interactive=False, variant="primary"),
                model_name_a: gr.update(value="*Model: Hidden*"),
                model_name_b: gr.update(value="*Model: Hidden*"),
                random_btn: gr.update(value="🎲", variant="secondary"),
            }
        else:
            # Just hide ground truth when disabling reference mode
            return {
                ground_truth: gr.update(visible=False)
            }

    # Update the change handler to include all necessary outputs
    use_reference_toggle.change(
        fn=toggle_use_reference,
        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
            human_input,
            ai_response,
            score_a,
            critique_a,
            score_b,
            critique_b,
            vote_a,
            vote_b,
            vote_tie,
            model_name_a,
            model_name_b,
            random_btn,
        ]
    )

    # Tracks whether the next submission is the session's first game
    first_game_state = gr.State(True)  # Initialize as True

    def submit_and_store(
        use_reference,
        eval_criteria_text_input,
        human_input,
        ai_response,
        ground_truth_input,
        score1_description,
        score2_description,
        score3_description,
        score4_description,
        score5_description,
        is_first_game,
    ):
        """Run two judges on the current sample and return their outputs.

        Selects a pair of active models, queries both with the same prompt
        data, parses each response with the organization-specific parser and
        enables the vote buttons.

        Returns:
            A 15-tuple for, in order: score_a, critique_a, score_b,
            critique_b, vote_a, vote_b, vote_tie, model_a_state,
            model_b_state, final_prompt_state, model_name_a, model_name_b,
            send_btn, random_btn, first_game_state.
        """
        # Build prompt data dictionary
        prompt_data = {
            'human_input': human_input,
            'ai_response': ai_response,
            'ground_truth_input': ground_truth_input,
            'eval_criteria': eval_criteria_text_input,
            'score1_desc': score1_description,
            'score2_desc': score2_description,
            'score3_desc': score3_description,
            'score4_desc': score4_description,
            'score5_desc': score5_description,
        }

        model_a, model_b = _select_judge_pair(is_first_game)

        # Get responses from models
        response_a = get_model_response(
            model_a,
            model_data.get(model_a),
            prompt_data,
            use_reference=use_reference
        )
        response_b = get_model_response(
            model_b,
            model_data.get(model_b),
            prompt_data,
            use_reference=use_reference
        )

        # Parse the responses using the appropriate parser for each model
        score_a_val, critique_a_val = _parse_judge_response(model_a, response_a)
        score_b_val, critique_b_val = _parse_judge_response(model_b, response_b)

        return (
            score_a_val,
            critique_a_val,
            score_b_val,
            critique_b_val,
            gr.update(interactive=True, variant="primary"),  # vote_a
            gr.update(interactive=True, variant="primary"),  # vote_b
            gr.update(interactive=True, variant="primary"),  # vote_tie
            model_a,
            model_b,
            # NOTE(review): this is the hidden `eval_prompt` Textbox
            # component, not a string; `store_vote_data` unwraps `.value`.
            eval_prompt,
            gr.update(value="*Model: Hidden*"),
            gr.update(value="*Model: Hidden*"),
            gr.update(value="Regenerate judges", variant="secondary", interactive=True),
            gr.update(value="🎲"),  # random_btn
            False,  # Set first_game_state to False after first submission
        )

    # Not referenced by any handler in this module; `send_btn` below calls
    # `submit_and_store` directly and tracks the first game in
    # `first_game_state`.
    def create_submit_handler():
        """Return a wrapper around `submit_and_store` that tracks the first game in a closure."""
        first_game = True

        def handler(*args):
            nonlocal first_game
            result = submit_and_store(*args, first_game)
            first_game = False  # Set to False after first submission
            return result

        return handler

    # Update the send_btn click handler
    send_btn.click(
        fn=submit_and_store,
        inputs=[
            use_reference_toggle,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
            score1_description,
            score2_description,
            score3_description,
            score4_description,
            score5_description,
            first_game_state,  # Add first_game_state as input
        ],
        outputs=[
            score_a,
            critique_a,
            score_b,
            critique_b,
            vote_a,
            vote_b,
            vote_tie,
            model_a_state,
            model_b_state,
            final_prompt_state,
            model_name_a,
            model_name_b,
            send_btn,
            random_btn,
            first_game_state,  # Add first_game_state as output
        ],
    )

    # Add random button handler
    random_btn.click(
        fn=populate_random_example,
        inputs=[use_reference_toggle],  # Use compatible mode toggle to decide behavior
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score_a,
            critique_a,
            score_b,
            critique_b,
            vote_a,
            vote_b,
            vote_tie,
            model_name_a,
            model_name_b,
            ground_truth,  # Set ground truth
        ]
    )

    # Add new input change handlers
    def handle_input_change():
        """Reset UI state when inputs are changed"""
        return [
            gr.update(interactive=False),  # vote_a
            gr.update(interactive=False),  # vote_b
            gr.update(interactive=False),  # vote_tie
            gr.update(value="Run judges", variant="primary"),  # send_btn
            gr.update(value="🎲", variant="secondary"),  # random_btn
        ]

    # Update the change handlers for inputs
    human_input.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
    )

    ai_response.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
    )

    generate_btn.click(
        fn=lambda msg: (
            generate_ai_response(msg)[0],  # Only take the response text
            gr.update(
                value="Generate AI Response",  # Keep the label
                interactive=False  # Disable the button
            )
        ),
        inputs=[human_input],
        outputs=[ai_response, generate_btn]
    )

    # Enable the generate button only while there is non-blank user input
    human_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[human_input],
        outputs=[generate_btn]
    )

    # Populate a random example when the page loads
    demo.load(
        fn=lambda: populate_random_example(None, False),  # Pass False for initial compatible_mode
        inputs=[],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score_a,
            critique_a,
            score_b,
            critique_b,
            vote_a,
            vote_b,
            vote_tie,
            model_name_a,
            model_name_b,
            ground_truth,
        ]
    )

    # Last saved values of the compatible-mode editor fields
    eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
    score1_previous = gr.State(value=DEFAULT_SCORE_1)
    score2_previous = gr.State(value=DEFAULT_SCORE_2)
    score3_previous = gr.State(value=DEFAULT_SCORE_3)
    score4_previous = gr.State(value=DEFAULT_SCORE_4)
    score5_previous = gr.State(value=DEFAULT_SCORE_5)

    # Add new functions to handle compatible mode saves/cancels
    def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
        """Accept the edited criteria and score descriptions as the new baseline.

        Returns, for each field, an update for the editor followed by the new
        saved value, then an update hiding the save/cancel row.
        """
        return [
            gr.update(value=criteria),  # Update criteria
            criteria,  # Update previous criteria state
            gr.update(value=score1),
            score1,
            gr.update(value=score2),
            score2,
            gr.update(value=score3),
            score3,
            gr.update(value=score4),
            score4,
            gr.update(value=score5),
            score5,
            gr.update(visible=False)  # Hide buttons
        ]

    def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
        """Revert every compatible-mode field to its last saved value and hide the buttons."""
        return [
            gr.update(value=prev_criteria),
            prev_criteria,
            gr.update(value=prev_score1),
            prev_score1,
            gr.update(value=prev_score2),
            prev_score2,
            gr.update(value=prev_score3),
            prev_score3,
            gr.update(value=prev_score4),
            prev_score4,
            gr.update(value=prev_score5),
            prev_score5,
            gr.update(visible=False)
        ]

    def show_compatible_edit_buttons(*current_values):
        """Show the save/cancel row when any field differs from its saved value.

        Arguments arrive interleaved as (current, previous) pairs.
        """
        previous_values = current_values[1::2]  # Get previous values
        current_values = current_values[::2]  # Get current values
        return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))

    # Add click handlers for compatible mode buttons
    compatible_save_btn.click(
        fn=save_compatible_prompt,
        inputs=[
            eval_criteria_text,
            score1_description,
            score2_description,
            score3_description,
            score4_description,
            score5_description
        ],
        outputs=[
            eval_criteria_text,
            eval_criteria_previous,
            score1_description,
            score1_previous,
            score2_description,
            score2_previous,
            score3_description,
            score3_previous,
            score4_description,
            score4_previous,
            score5_description,
            score5_previous,
            compatible_edit_buttons_row
        ]
    )

    compatible_cancel_btn.click(
        fn=cancel_compatible_prompt,
        inputs=[
            eval_criteria_previous,
            score1_previous,
            score2_previous,
            score3_previous,
            score4_previous,
            score5_previous
        ],
        outputs=[
            eval_criteria_text,
            eval_criteria_previous,
            score1_description,
            score1_previous,
            score2_description,
            score2_previous,
            score3_description,
            score3_previous,
            score4_description,
            score4_previous,
            score5_description,
            score5_previous,
            compatible_edit_buttons_row
        ]
    )

    # Add change handlers for all compatible mode inputs
    for component in [eval_criteria_text, score1_description, score2_description,
                      score3_description, score4_description, score5_description]:
        component.change(
            fn=show_compatible_edit_buttons,
            inputs=[
                eval_criteria_text,
                eval_criteria_previous,
                score1_description,
                score1_previous,
                score2_description,
                score2_previous,
                score3_description,
                score3_previous,
                score4_description,
                score4_previous,
                score5_description,
                score5_previous
            ],
            outputs=compatible_edit_buttons_row
        )

if __name__ == "__main__":
    demo.launch()