import ast
import pandas as pd
import gradio as gr
import litellm
import plotly.express as px
from collections import defaultdict
from datetime import datetime
import os



# Fail fast if the DeepSeek API key is missing; litellm reads it from the environment.
if 'DEEPSEEK_API_KEY' not in os.environ:
    raise RuntimeError("Set the DEEPSEEK_API_KEY environment variable before running.")

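# Expected input: each record carries 'question', 'choices', 'answer', and
# 'subject' fields (schema inferred from the field accesses below); 'choices'
# may arrive as a stringified Python list that must be parsed back into a list.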
def preprocess_dataset(test_data):
    """
    Preprocess the dataset to convert the 'choices' field from a string to a list of strings.
    """
    preprocessed_data = []
    for example in test_data:
        if isinstance(example['choices'], str):
            # Strip one layer of wrapping quotes, then unescape any escaped
            # single quotes so ast.literal_eval can parse the list literal.
            choices_str = example['choices']
            if (choices_str.startswith("'") and choices_str.endswith("'")) or (
                choices_str.startswith('"') and choices_str.endswith('"')
            ):
                choices_str = choices_str[1:-1]
            choices_str = choices_str.replace("\\'", "'")
            try:
                example['choices'] = ast.literal_eval(choices_str)
            except (ValueError, SyntaxError):
                print(f"Error parsing choices: {choices_str}")
                continue
        preprocessed_data.append(example)
    return preprocessed_data
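
# A minimal sketch of the transformation above (values are illustrative only):
#   before: {'choices': "['Lagos', 'Accra', 'Nairobi', 'Dakar']", ...}
#   after:  {'choices': ['Lagos', 'Accra', 'Nairobi', 'Dakar'], ...}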

def evaluate_afrimmlu(test_data, model_name="deepseek/deepseek-chat"):
    """
    Evaluate the model on the AfriMMLU dataset.
    """
    results = []
    correct = 0
    total = 0
    subject_results = defaultdict(lambda: {"correct": 0, "total": 0})

    for example in test_data:
        question = example['question']
        choices = example['choices']
        answer = example['answer']
        subject = example['subject']

        prompt = (
            f"Answer the following multiple-choice question. "
            f"Return only the letter corresponding to the correct answer (A, B, C, or D).\n"
            f"Question: {question}\n"
            f"Options:\n"
            f"A. {choices[0]}\n"
            f"B. {choices[1]}\n"
            f"C. {choices[2]}\n"
            f"D. {choices[3]}\n"
            f"Answer:"
        )

        try:
            response = litellm.completion(
                model=model_name,
                messages=[{"role": "user", "content": prompt}]
            )
            model_output = response.choices[0].message.content.strip().upper()
            
            model_answer = None
            for char in model_output:
                if char in ['A', 'B', 'C', 'D']:
                    model_answer = char
                    break

            is_correct = model_answer == answer.upper()
            if is_correct:
                correct += 1
                subject_results[subject]["correct"] += 1
            total += 1
            subject_results[subject]["total"] += 1

            # Store detailed results
            results.append({
                'timestamp': datetime.now().isoformat(),
                'subject': subject,
                'question': question,
                'model_answer': model_answer,
                'correct_answer': answer.upper(),
                'is_correct': is_correct,
                'total_tokens': response.usage.total_tokens
            })

        except Exception as e:
            print(f"Error processing question: {str(e)}")
            continue

    # Calculate accuracies
    accuracy = (correct / total * 100) if total > 0 else 0
    subject_accuracy = {
        subject: (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0
        for subject, stats in subject_results.items()
    }

    # Export results to CSV
    df = pd.DataFrame(results)
    df.to_csv('detailed_results.csv', index=False)

    # Export summary to CSV
    summary_data = [{'subject': subject, 'accuracy': acc} 
                   for subject, acc in subject_accuracy.items()]
    summary_data.append({'subject': 'Overall', 'accuracy': accuracy})
    pd.DataFrame(summary_data).to_csv('summary_results.csv', index=False)

    return {
        "accuracy": accuracy,
        "subject_accuracy": subject_accuracy,
        "detailed_results": results
    }
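
# A minimal sketch of running the evaluation outside the UI (assumes
# DEEPSEEK_API_KEY is set; 'afrimmlu_test.json' is a hypothetical file
# matching the record schema noted above):
#
#   records = pd.read_json('afrimmlu_test.json').to_dict('records')
#   results = evaluate_afrimmlu(preprocess_dataset(records))
#   print(f"Overall accuracy: {results['accuracy']:.1f}%")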

def create_visualization(results_dict):
    """
    Create visualization from evaluation results.
    """
    summary_data = [
        {'Subject': subject, 'Accuracy (%)': accuracy} 
        for subject, accuracy in results_dict['subject_accuracy'].items()
    ]
    summary_data.append({'Subject': 'Overall', 'Accuracy (%)': results_dict['accuracy']})
    summary_df = pd.DataFrame(summary_data)
    
    fig = px.bar(
        summary_df,
        x='Subject',
        y='Accuracy (%)',
        title='AfriMMLU Evaluation Results',
        labels={'Subject': 'Subject', 'Accuracy (%)': 'Accuracy (%)'}
    )
    fig.update_layout(
        xaxis_tickangle=-45,
        showlegend=False,
        height=600
    )
    
    return summary_df, fig

def evaluate_and_display(test_file, model_name):
    # Load and preprocess data
    # Gradio may pass a tempfile-like object or a plain path string,
    # depending on version; handle both.
    file_path = test_file.name if hasattr(test_file, 'name') else test_file
    test_data = pd.read_json(file_path)
    preprocessed_data = preprocess_dataset(test_data.to_dict('records'))
    
    # Run evaluation
    results = evaluate_afrimmlu(preprocessed_data, model_name)
    
    # Create visualizations
    summary_df, plot = create_visualization(results)
    
    # Load detailed results with error handling
    try:
        detailed_df = pd.read_csv('detailed_results.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        detailed_df = pd.DataFrame(results["detailed_results"])
    
    return summary_df, plot, detailed_df


def create_gradio_interface():
    """
    Create and configure the Gradio interface.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # AfriMMLU Evaluation Dashboard
        Upload your test data and select a model to evaluate performance on the AfriMMLU benchmark.
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="Upload Test Data (JSON)",
                    file_types=[".json"]
                )
                model_input = gr.Dropdown(
                    choices=["deepseek/deepseek-chat"],
                    label="Select Model",
                    value="deepseek/deepseek-chat"
                )
                evaluate_btn = gr.Button("Evaluate", variant="primary")
        
        with gr.Row():
            with gr.Column():
                summary_table = gr.Dataframe(
                    headers=["Subject", "Accuracy (%)"],
                    label="Summary Results"
                )
        
        with gr.Row():
            with gr.Column():
                summary_plot = gr.Plot(label="Performance by Subject")
        
        with gr.Row():
            with gr.Column():
                detailed_results = gr.Dataframe(
                    label="Detailed Results",
                    wrap=True
                )
        
        evaluate_btn.click(
            fn=evaluate_and_display,
            inputs=[file_input, model_input],
            outputs=[summary_table, summary_plot, detailed_results]
        )
    
    return demo

if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(share=True)
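
# To launch locally (hypothetical entry-point name):
#   DEEPSEEK_API_KEY=sk-... python app.py
# share=True additionally exposes a temporary public Gradio link
# alongside the local URL.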