import ast
import os
from collections import defaultdict
from datetime import datetime

import gradio as gr
import litellm
import pandas as pd
import plotly.express as px


# Fail fast with a clear message if the DeepSeek API key that LiteLLM reads
# from the environment is missing.
if "DEEPSEEK_API_KEY" not in os.environ:
    raise RuntimeError("Please set the DEEPSEEK_API_KEY environment variable.")

def preprocess_dataset(test_data):
    """
    Preprocess the dataset to convert the 'choices' field from a string to a list of strings.
    """
    preprocessed_data = []
    for example in test_data:
        if isinstance(example['choices'], str):
            choices_str = example['choices']
            # Strip one pair of wrapping quotes, if present, then unescape
            # single quotes so ast.literal_eval sees a valid list literal.
            if choices_str.startswith("'") and choices_str.endswith("'"):
                choices_str = choices_str[1:-1]
            elif choices_str.startswith('"') and choices_str.endswith('"'):
                choices_str = choices_str[1:-1]
            choices_str = choices_str.replace("\\'", "'")
            try:
                example['choices'] = ast.literal_eval(choices_str)
            except (ValueError, SyntaxError):
                print(f"Error parsing choices: {choices_str}")
                continue
        preprocessed_data.append(example)
    return preprocessed_data

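# Illustrative example (hypothetical record, not taken from the dataset):
# a raw record such as {"choices": "['Algeria', 'Nigeria', 'Ethiopia', 'Egypt']", ...}
# comes back from preprocess_dataset with 'choices' parsed into a real Python list,
# while records whose 'choices' string cannot be parsed are logged and skipped.
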
def evaluate_afrimmlu(test_data, model_name="deepseek/deepseek-chat"):
    """
    Evaluate the model on the AfriMMLU dataset.
    """
    results = []
    correct = 0
    total = 0
    subject_results = defaultdict(lambda: {"correct": 0, "total": 0})

    for example in test_data:
        question = example['question']
        choices = example['choices']
        answer = example['answer']
        subject = example['subject']

        prompt = (
            f"Answer the following multiple-choice question. "
            f"Return only the letter corresponding to the correct answer (A, B, C, or D).\n"
            f"Question: {question}\n"
            f"Options:\n"
            f"A. {choices[0]}\n"
            f"B. {choices[1]}\n"
            f"C. {choices[2]}\n"
            f"D. {choices[3]}\n"
            f"Answer:"
        )

        try:
            response = litellm.completion(
                model=model_name,
                messages=[{"role": "user", "content": prompt}]
            )
            model_output = response.choices[0].message.content.strip().upper()

            # Take the first A/B/C/D that appears in the model's reply.
            model_answer = None
            for char in model_output:
                if char in ['A', 'B', 'C', 'D']:
                    model_answer = char
                    break

            is_correct = model_answer == answer.upper()
            if is_correct:
                correct += 1
                subject_results[subject]["correct"] += 1
            total += 1
            subject_results[subject]["total"] += 1

            results.append({
                'timestamp': datetime.now().isoformat(),
                'subject': subject,
                'question': question,
                'model_answer': model_answer,
                'correct_answer': answer.upper(),
                'is_correct': is_correct,
                'total_tokens': response.usage.total_tokens
            })

        except Exception as e:
            print(f"Error processing question: {str(e)}")
            continue

    accuracy = (correct / total * 100) if total > 0 else 0
    subject_accuracy = {
        subject: (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0
        for subject, stats in subject_results.items()
    }

    # Persist per-question and per-subject results as CSV side effects.
    df = pd.DataFrame(results)
    df.to_csv('detailed_results.csv', index=False)

    summary_data = [{'subject': subject, 'accuracy': acc}
                    for subject, acc in subject_accuracy.items()]
    summary_data.append({'subject': 'Overall', 'accuracy': accuracy})
    pd.DataFrame(summary_data).to_csv('summary_results.csv', index=False)

    return {
        "accuracy": accuracy,
        "subject_accuracy": subject_accuracy,
        "detailed_results": results
    }

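# Minimal usage sketch (assumes DEEPSEEK_API_KEY is set and `raw_records` is a
# small list of dataset records; variable names are illustrative):
#   examples = preprocess_dataset(raw_records)
#   scores = evaluate_afrimmlu(examples, model_name="deepseek/deepseek-chat")
#   print(scores["accuracy"], scores["subject_accuracy"])
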
def create_visualization(results_dict):
    """
    Create visualization from evaluation results.
    """
    summary_data = [
        {'Subject': subject, 'Accuracy (%)': accuracy}
        for subject, accuracy in results_dict['subject_accuracy'].items()
    ]
    summary_data.append({'Subject': 'Overall', 'Accuracy (%)': results_dict['accuracy']})
    summary_df = pd.DataFrame(summary_data)

    fig = px.bar(
        summary_df,
        x='Subject',
        y='Accuracy (%)',
        title='AfriMMLU Evaluation Results',
        labels={'Subject': 'Subject', 'Accuracy (%)': 'Accuracy (%)'}
    )
    fig.update_layout(
        xaxis_tickangle=-45,
        showlegend=False,
        height=600
    )

    return summary_df, fig

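# Design note: the (DataFrame, Figure) pair returned here is what the Gradio
# callback below binds to the summary gr.Dataframe and the gr.Plot component.
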
def evaluate_and_display(test_file, model_name):
    """
    Run the evaluation on an uploaded JSON file and return the outputs for the UI.
    """
    test_data = pd.read_json(test_file.name)
    preprocessed_data = preprocess_dataset(test_data.to_dict('records'))

    results = evaluate_afrimmlu(preprocessed_data, model_name)

    summary_df, plot = create_visualization(results)

    try:
        detailed_df = pd.read_csv('detailed_results.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        detailed_df = pd.DataFrame(results["detailed_results"])

    return summary_df, plot, detailed_df

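# The uploaded JSON is expected to hold a list of records readable with
# pd.read_json, each containing the fields used above. Illustrative record
# (values are made up, not taken from the dataset):
#   {"subject": "geography",
#    "question": "Which country has the largest land area in Africa?",
#    "choices": "['Algeria', 'Nigeria', 'Ethiopia', 'Egypt']",
#    "answer": "A"}
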
def create_gradio_interface():
    """
    Create and configure the Gradio interface.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # AfriMMLU Evaluation Dashboard
        Upload your test data and select a model to evaluate performance on the AfriMMLU benchmark.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="Upload Test Data (JSON)",
                    file_types=[".json"]
                )
                model_input = gr.Dropdown(
                    choices=["deepseek/deepseek-chat"],
                    label="Select Model",
                    value="deepseek/deepseek-chat"
                )
                evaluate_btn = gr.Button("Evaluate", variant="primary")

        with gr.Row():
            with gr.Column():
                summary_table = gr.Dataframe(
                    headers=["Subject", "Accuracy (%)"],
                    label="Summary Results"
                )

        with gr.Row():
            with gr.Column():
                summary_plot = gr.Plot(label="Performance by Subject")

        with gr.Row():
            with gr.Column():
                detailed_results = gr.Dataframe(
                    label="Detailed Results",
                    wrap=True
                )

        evaluate_btn.click(
            fn=evaluate_and_display,
            inputs=[file_input, model_input],
            outputs=[summary_table, summary_plot, detailed_results]
        )

    return demo

if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(share=True)