Update app.py
Browse files
app.py
CHANGED
@@ -212,10 +212,18 @@ def create_visualization(results_dict):
|
|
212 |
|
213 |
return summary_df, fig
|
214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
def create_gradio_interface():
|
216 |
-
"""
|
217 |
-
Create and configure the Gradio interface.
|
218 |
-
"""
|
219 |
language_options = {
|
220 |
"swa": "Swahili",
|
221 |
"yor": "Yoruba",
|
@@ -225,46 +233,78 @@ def create_gradio_interface():
|
|
225 |
"ibo": "Igbo"
|
226 |
}
|
227 |
|
228 |
-
# Initialize database
|
229 |
initialize_database()
|
230 |
|
231 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
232 |
-
gr.Markdown(""
|
233 |
-
# AfriMMLU Evaluation Dashboard
|
234 |
-
Select a language and model to evaluate performance on the AfriMMLU benchmark.
|
235 |
-
""")
|
236 |
-
|
237 |
-
with gr.Row():
|
238 |
-
with gr.Column(scale=1):
|
239 |
-
language_input = gr.Dropdown(
|
240 |
-
choices=list(language_options.keys()),
|
241 |
-
label="Select Language",
|
242 |
-
value="swa"
|
243 |
-
)
|
244 |
-
model_input = gr.Dropdown(
|
245 |
-
choices=["deepseek/deepseek-chat"],
|
246 |
-
label="Select Model",
|
247 |
-
value="deepseek/deepseek-chat"
|
248 |
-
)
|
249 |
-
evaluate_btn = gr.Button("Evaluate", variant="primary")
|
250 |
-
|
251 |
-
with gr.Row():
|
252 |
-
with gr.Column():
|
253 |
-
summary_table = gr.Dataframe(
|
254 |
-
headers=["Subject", "Accuracy (%)"],
|
255 |
-
label="Summary Results"
|
256 |
-
)
|
257 |
|
258 |
-
with gr.
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
|
269 |
def evaluate_language(language_code, model_name):
|
270 |
test_data = load_afrimmlu_data(language_code)
|
@@ -278,14 +318,31 @@ def create_gradio_interface():
|
|
278 |
|
279 |
return summary_df, plot, detailed_df
|
280 |
|
|
|
|
|
281 |
evaluate_btn.click(
|
282 |
fn=evaluate_language,
|
283 |
inputs=[language_input, model_input],
|
284 |
outputs=[summary_table, summary_plot, detailed_results]
|
285 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
|
287 |
return demo
|
288 |
|
|
|
|
|
289 |
if __name__ == "__main__":
|
290 |
os.environ['DEEPSEEK_API_KEY']
|
291 |
os.environ['HF_TOKEN']
|
|
|
212 |
|
213 |
return summary_df, fig
|
214 |
|
215 |
+
|
216 |
+
def query_database(query):
    """Run a SQL query against the results database and return a DataFrame.

    The query text comes straight from the "SQL Query" textbox in the UI, so
    it is treated as untrusted: the database is opened read-only, which makes
    SQLite reject any statement that would modify data (DROP, DELETE, UPDATE,
    INSERT, ...) instead of executing it.

    Args:
        query: SQL text to execute against ``afrimmlu_results.db``.

    Returns:
        A DataFrame holding the query result. On any failure (empty query,
        database cannot be opened, invalid SQL, write attempt) a one-column
        DataFrame named ``'Error'`` containing the error message is returned
        so the UI can display it instead of crashing.
    """
    if not query or not query.strip():
        return pd.DataFrame({'Error': ['Query is empty']})

    conn = None
    try:
        # mode=ro: user-supplied SQL must never be able to alter stored results.
        # Connecting inside the try keeps connection failures on the same
        # Error-DataFrame path as query failures.
        conn = sqlite3.connect('file:afrimmlu_results.db?mode=ro', uri=True)
        return pd.read_sql_query(query, conn)
    except Exception as e:
        # Deliberately broad: this is the UI boundary and every failure is
        # reported back to the user as a table row.
        return pd.DataFrame({'Error': [str(e)]})
    finally:
        if conn is not None:
            conn.close()
|
225 |
+
|
226 |
def create_gradio_interface():
|
|
|
|
|
|
|
227 |
language_options = {
|
228 |
"swa": "Swahili",
|
229 |
"yor": "Yoruba",
|
|
|
233 |
"ibo": "Igbo"
|
234 |
}
|
235 |
|
|
|
236 |
initialize_database()
|
237 |
|
238 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
239 |
+
gr.Markdown("# AfriMMLU Evaluation Dashboard")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
|
241 |
+
with gr.Tabs():
|
242 |
+
# Evaluation Tab
|
243 |
+
with gr.Tab("Model Evaluation"):
|
244 |
+
with gr.Row():
|
245 |
+
with gr.Column(scale=1):
|
246 |
+
language_input = gr.Dropdown(
|
247 |
+
choices=list(language_options.keys()),
|
248 |
+
label="Select Language",
|
249 |
+
value="swa"
|
250 |
+
)
|
251 |
+
model_input = gr.Dropdown(
|
252 |
+
choices=["deepseek/deepseek-chat"],
|
253 |
+
label="Select Model",
|
254 |
+
value="deepseek/deepseek-chat"
|
255 |
+
)
|
256 |
+
evaluate_btn = gr.Button("Evaluate", variant="primary")
|
257 |
+
|
258 |
+
with gr.Row():
|
259 |
+
summary_table = gr.Dataframe(
|
260 |
+
headers=["Subject", "Accuracy (%)"],
|
261 |
+
label="Summary Results"
|
262 |
+
)
|
263 |
+
|
264 |
+
with gr.Row():
|
265 |
+
summary_plot = gr.Plot(label="Performance by Subject")
|
266 |
+
|
267 |
+
with gr.Row():
|
268 |
+
detailed_results = gr.Dataframe(
|
269 |
+
label="Detailed Results",
|
270 |
+
wrap=True
|
271 |
+
)
|
272 |
+
|
273 |
+
# Query Tab
|
274 |
+
with gr.Tab("Database Analysis"):
|
275 |
+
with gr.Row():
|
276 |
+
with gr.Column():
|
277 |
+
example_queries = gr.Dropdown(
|
278 |
+
choices=[
|
279 |
+
"SELECT language, AVG(accuracy) as avg_accuracy FROM summary_results WHERE subject='Overall' GROUP BY language",
|
280 |
+
"SELECT subject, AVG(accuracy) as avg_accuracy FROM summary_results GROUP BY subject",
|
281 |
+
"SELECT language, subject, accuracy, timestamp FROM summary_results ORDER BY timestamp DESC LIMIT 10",
|
282 |
+
"SELECT language, COUNT(*) as total_questions, SUM(is_correct) as correct_answers FROM detailed_results GROUP BY language",
|
283 |
+
"SELECT subject, COUNT(*) as total_evaluations FROM summary_results GROUP BY subject"
|
284 |
+
],
|
285 |
+
label="Example Queries",
|
286 |
+
value="SELECT language, AVG(accuracy) as avg_accuracy FROM summary_results WHERE subject='Overall' GROUP BY language"
|
287 |
+
)
|
288 |
+
|
289 |
+
query_input = gr.Textbox(
|
290 |
+
label="SQL Query",
|
291 |
+
placeholder="Enter your SQL query here",
|
292 |
+
lines=3
|
293 |
+
)
|
294 |
+
|
295 |
+
query_button = gr.Button("Run Query", variant="primary")
|
296 |
+
|
297 |
+
gr.Markdown("""
|
298 |
+
### Available Tables:
|
299 |
+
1. summary_results (id, language, subject, accuracy, timestamp)
|
300 |
+
2. detailed_results (id, language, timestamp, subject, question, model_answer, correct_answer, is_correct, total_tokens)
|
301 |
+
""")
|
302 |
+
|
303 |
+
with gr.Row():
|
304 |
+
query_output = gr.Dataframe(
|
305 |
+
label="Query Results",
|
306 |
+
wrap=True
|
307 |
+
)
|
308 |
|
309 |
def evaluate_language(language_code, model_name):
|
310 |
test_data = load_afrimmlu_data(language_code)
|
|
|
318 |
|
319 |
return summary_df, plot, detailed_df
|
320 |
|
321 |
+
|
322 |
+
# Evaluation tab callback
|
323 |
evaluate_btn.click(
|
324 |
fn=evaluate_language,
|
325 |
inputs=[language_input, model_input],
|
326 |
outputs=[summary_table, summary_plot, detailed_results]
|
327 |
)
|
328 |
+
|
329 |
+
# Query tab callbacks
|
330 |
+
example_queries.change(
|
331 |
+
fn=lambda x: x,
|
332 |
+
inputs=[example_queries],
|
333 |
+
outputs=[query_input]
|
334 |
+
)
|
335 |
+
|
336 |
+
query_button.click(
|
337 |
+
fn=query_database,
|
338 |
+
inputs=[query_input],
|
339 |
+
outputs=[query_output]
|
340 |
+
)
|
341 |
|
342 |
return demo
|
343 |
|
344 |
+
|
345 |
+
|
346 |
if __name__ == "__main__":
|
347 |
os.environ['DEEPSEEK_API_KEY']
|
348 |
os.environ['HF_TOKEN']
|