Svngoku committed on
Commit
f5d93aa
·
verified ·
1 Parent(s): f200b56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -39
app.py CHANGED
@@ -212,10 +212,18 @@ def create_visualization(results_dict):
212
 
213
  return summary_df, fig
214
 
 
 
 
 
 
 
 
 
 
 
 
215
  def create_gradio_interface():
216
- """
217
- Create and configure the Gradio interface.
218
- """
219
  language_options = {
220
  "swa": "Swahili",
221
  "yor": "Yoruba",
@@ -225,46 +233,78 @@ def create_gradio_interface():
225
  "ibo": "Igbo"
226
  }
227
 
228
- # Initialize database
229
  initialize_database()
230
 
231
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
232
- gr.Markdown("""
233
- # AfriMMLU Evaluation Dashboard
234
- Select a language and model to evaluate performance on the AfriMMLU benchmark.
235
- """)
236
-
237
- with gr.Row():
238
- with gr.Column(scale=1):
239
- language_input = gr.Dropdown(
240
- choices=list(language_options.keys()),
241
- label="Select Language",
242
- value="swa"
243
- )
244
- model_input = gr.Dropdown(
245
- choices=["deepseek/deepseek-chat"],
246
- label="Select Model",
247
- value="deepseek/deepseek-chat"
248
- )
249
- evaluate_btn = gr.Button("Evaluate", variant="primary")
250
-
251
- with gr.Row():
252
- with gr.Column():
253
- summary_table = gr.Dataframe(
254
- headers=["Subject", "Accuracy (%)"],
255
- label="Summary Results"
256
- )
257
 
258
- with gr.Row():
259
- with gr.Column():
260
- summary_plot = gr.Plot(label="Performance by Subject")
261
-
262
- with gr.Row():
263
- with gr.Column():
264
- detailed_results = gr.Dataframe(
265
- label="Detailed Results",
266
- wrap=True
267
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  def evaluate_language(language_code, model_name):
270
  test_data = load_afrimmlu_data(language_code)
@@ -278,14 +318,31 @@ def create_gradio_interface():
278
 
279
  return summary_df, plot, detailed_df
280
 
 
 
281
  evaluate_btn.click(
282
  fn=evaluate_language,
283
  inputs=[language_input, model_input],
284
  outputs=[summary_table, summary_plot, detailed_results]
285
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  return demo
288
 
 
 
289
  if __name__ == "__main__":
290
  os.environ['DEEPSEEK_API_KEY']
291
  os.environ['HF_TOKEN']
 
212
 
213
  return summary_df, fig
214
 
215
+
216
def query_database(query, db_path='afrimmlu_results.db'):
    """Run a read-only SQL query against the results database.

    Intended for the dashboard's "Database Analysis" tab: any failure
    (bad SQL, missing table, locked DB) is caught and surfaced to the
    user as a one-column DataFrame instead of raising, so the Gradio UI
    never crashes on a malformed query.

    Args:
        query: SQL statement to execute (typically a SELECT).
        db_path: Path to the SQLite database file. Defaults to the
            evaluation results database used by the rest of the app.

    Returns:
        pandas.DataFrame with the query results, or a DataFrame with a
        single 'Error' column containing the exception message.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    except Exception as e:
        # Deliberately broad: the message is shown in the UI results grid.
        return pd.DataFrame({'Error': [str(e)]})
    finally:
        # Always release the connection, success or failure.
        conn.close()
225
+
226
  def create_gradio_interface():
 
 
 
227
  language_options = {
228
  "swa": "Swahili",
229
  "yor": "Yoruba",
 
233
  "ibo": "Igbo"
234
  }
235
 
 
236
  initialize_database()
237
 
238
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
239
+ gr.Markdown("# AfriMMLU Evaluation Dashboard")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
+ with gr.Tabs():
242
+ # Evaluation Tab
243
+ with gr.Tab("Model Evaluation"):
244
+ with gr.Row():
245
+ with gr.Column(scale=1):
246
+ language_input = gr.Dropdown(
247
+ choices=list(language_options.keys()),
248
+ label="Select Language",
249
+ value="swa"
250
+ )
251
+ model_input = gr.Dropdown(
252
+ choices=["deepseek/deepseek-chat"],
253
+ label="Select Model",
254
+ value="deepseek/deepseek-chat"
255
+ )
256
+ evaluate_btn = gr.Button("Evaluate", variant="primary")
257
+
258
+ with gr.Row():
259
+ summary_table = gr.Dataframe(
260
+ headers=["Subject", "Accuracy (%)"],
261
+ label="Summary Results"
262
+ )
263
+
264
+ with gr.Row():
265
+ summary_plot = gr.Plot(label="Performance by Subject")
266
+
267
+ with gr.Row():
268
+ detailed_results = gr.Dataframe(
269
+ label="Detailed Results",
270
+ wrap=True
271
+ )
272
+
273
+ # Query Tab
274
+ with gr.Tab("Database Analysis"):
275
+ with gr.Row():
276
+ with gr.Column():
277
+ example_queries = gr.Dropdown(
278
+ choices=[
279
+ "SELECT language, AVG(accuracy) as avg_accuracy FROM summary_results WHERE subject='Overall' GROUP BY language",
280
+ "SELECT subject, AVG(accuracy) as avg_accuracy FROM summary_results GROUP BY subject",
281
+ "SELECT language, subject, accuracy, timestamp FROM summary_results ORDER BY timestamp DESC LIMIT 10",
282
+ "SELECT language, COUNT(*) as total_questions, SUM(is_correct) as correct_answers FROM detailed_results GROUP BY language",
283
+ "SELECT subject, COUNT(*) as total_evaluations FROM summary_results GROUP BY subject"
284
+ ],
285
+ label="Example Queries",
286
+ value="SELECT language, AVG(accuracy) as avg_accuracy FROM summary_results WHERE subject='Overall' GROUP BY language"
287
+ )
288
+
289
+ query_input = gr.Textbox(
290
+ label="SQL Query",
291
+ placeholder="Enter your SQL query here",
292
+ lines=3
293
+ )
294
+
295
+ query_button = gr.Button("Run Query", variant="primary")
296
+
297
+ gr.Markdown("""
298
+ ### Available Tables:
299
+ 1. summary_results (id, language, subject, accuracy, timestamp)
300
+ 2. detailed_results (id, language, timestamp, subject, question, model_answer, correct_answer, is_correct, total_tokens)
301
+ """)
302
+
303
+ with gr.Row():
304
+ query_output = gr.Dataframe(
305
+ label="Query Results",
306
+ wrap=True
307
+ )
308
 
309
  def evaluate_language(language_code, model_name):
310
  test_data = load_afrimmlu_data(language_code)
 
318
 
319
  return summary_df, plot, detailed_df
320
 
321
+
322
+ # Evaluation tab callback
323
  evaluate_btn.click(
324
  fn=evaluate_language,
325
  inputs=[language_input, model_input],
326
  outputs=[summary_table, summary_plot, detailed_results]
327
  )
328
+
329
+ # Query tab callbacks
330
+ example_queries.change(
331
+ fn=lambda x: x,
332
+ inputs=[example_queries],
333
+ outputs=[query_input]
334
+ )
335
+
336
+ query_button.click(
337
+ fn=query_database,
338
+ inputs=[query_input],
339
+ outputs=[query_output]
340
+ )
341
 
342
  return demo
343
 
344
+
345
+
346
  if __name__ == "__main__":
347
  os.environ['DEEPSEEK_API_KEY']
348
  os.environ['HF_TOKEN']