Spaces:

Svngoku
/

afrimmlu-iroko-bench-deepseek

Running

App Files Files Community

Svngoku committed 18 days ago

Commit

f5d93aa

verified ·

1 Parent(s): f200b56

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -39

app.py CHANGED Viewed

@@ -212,10 +212,18 @@ def create_visualization(results_dict):
     return summary_df, fig
 def create_gradio_interface():
-    """
-    Create and configure the Gradio interface.
-    """
     language_options = {
         "swa": "Swahili",
         "yor": "Yoruba",
@@ -225,46 +233,78 @@ def create_gradio_interface():
         "ibo": "Igbo"
     }
-    # Initialize database
     initialize_database()
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # AfriMMLU Evaluation Dashboard
-        Select a language and model to evaluate performance on the AfriMMLU benchmark.
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                language_input = gr.Dropdown(
-                    choices=list(language_options.keys()),
-                    label="Select Language",
-                    value="swa"
-                )
-                model_input = gr.Dropdown(
-                    choices=["deepseek/deepseek-chat"],
-                    label="Select Model",
-                    value="deepseek/deepseek-chat"
-                )
-                evaluate_btn = gr.Button("Evaluate", variant="primary")
-        with gr.Row():
-            with gr.Column():
-                summary_table = gr.Dataframe(
-                    headers=["Subject", "Accuracy (%)"],
-                    label="Summary Results"
-                )
-        with gr.Row():
-            with gr.Column():
-                summary_plot = gr.Plot(label="Performance by Subject")
-        with gr.Row():
-            with gr.Column():
-                detailed_results = gr.Dataframe(
-                    label="Detailed Results",
-                    wrap=True
-                )
         def evaluate_language(language_code, model_name):
             test_data = load_afrimmlu_data(language_code)
@@ -278,14 +318,31 @@ def create_gradio_interface():
             return summary_df, plot, detailed_df
         evaluate_btn.click(
             fn=evaluate_language,
             inputs=[language_input, model_input],
             outputs=[summary_table, summary_plot, detailed_results]
         )
     return demo
 if __name__ == "__main__":
     os.environ['DEEPSEEK_API_KEY']
     os.environ['HF_TOKEN']

     return summary_df, fig
def query_database(query):
    """Run a read-only SQL query against the results database.

    The query text is supplied by the caller as a raw SQL string, so the
    database is opened in SQLite's read-only mode: statements that would
    modify or drop data fail instead of being executed.

    Args:
        query: SQL text to execute against ``afrimmlu_results.db``.

    Returns:
        A ``pandas.DataFrame`` holding the query result. On any failure
        (blank query, missing database file, invalid SQL, attempted write)
        a single-column DataFrame named ``Error`` with the message is
        returned instead of raising, so the caller can display it directly.
    """
    if not query or not query.strip():
        return pd.DataFrame({'Error': ['Query is empty']})

    conn = None
    try:
        # mode=ro: never create the file and reject every write statement.
        # Connecting inside the try keeps connection errors within the
        # "return an Error frame" contract.
        conn = sqlite3.connect('file:afrimmlu_results.db?mode=ro', uri=True)
        return pd.read_sql_query(query, conn)
    except Exception as e:
        # Deliberate best-effort: surface the message instead of crashing.
        return pd.DataFrame({'Error': [str(e)]})
    finally:
        if conn is not None:
            conn.close()
 def create_gradio_interface():
     language_options = {
         "swa": "Swahili",
         "yor": "Yoruba",
         "ibo": "Igbo"
     }
     initialize_database()
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# AfriMMLU Evaluation Dashboard")
+        with gr.Tabs():
+            # Evaluation Tab
+            with gr.Tab("Model Evaluation"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        language_input = gr.Dropdown(
+                            choices=list(language_options.keys()),
+                            label="Select Language",
+                            value="swa"
+                        )
+                        model_input = gr.Dropdown(
+                            choices=["deepseek/deepseek-chat"],
+                            label="Select Model",
+                            value="deepseek/deepseek-chat"
+                        )
+                        evaluate_btn = gr.Button("Evaluate", variant="primary")
+                with gr.Row():
+                    summary_table = gr.Dataframe(
+                        headers=["Subject", "Accuracy (%)"],
+                        label="Summary Results"
+                    )
+                with gr.Row():
+                    summary_plot = gr.Plot(label="Performance by Subject")
+                with gr.Row():
+                    detailed_results = gr.Dataframe(
+                        label="Detailed Results",
+                        wrap=True
+                    )
+            # Query Tab
+            with gr.Tab("Database Analysis"):
+                with gr.Row():
+                    with gr.Column():
+                        example_queries = gr.Dropdown(
+                            choices=[
+                                "SELECT language, AVG(accuracy) as avg_accuracy FROM summary_results WHERE subject='Overall' GROUP BY language",
+                                "SELECT subject, AVG(accuracy) as avg_accuracy FROM summary_results GROUP BY subject",
+                                "SELECT language, subject, accuracy, timestamp FROM summary_results ORDER BY timestamp DESC LIMIT 10",
+                                "SELECT language, COUNT(*) as total_questions, SUM(is_correct) as correct_answers FROM detailed_results GROUP BY language",
+                                "SELECT subject, COUNT(*) as total_evaluations FROM summary_results GROUP BY subject"
+                            ],
+                            label="Example Queries",
+                            value="SELECT language, AVG(accuracy) as avg_accuracy FROM summary_results WHERE subject='Overall' GROUP BY language"
+                        )
+                        query_input = gr.Textbox(
+                            label="SQL Query",
+                            placeholder="Enter your SQL query here",
+                            lines=3
+                        )
+                        query_button = gr.Button("Run Query", variant="primary")
+                        gr.Markdown("""
+                        ### Available Tables:
+                        1. summary_results (id, language, subject, accuracy, timestamp)
+                        2. detailed_results (id, language, timestamp, subject, question, model_answer, correct_answer, is_correct, total_tokens)
+                        """)
+                with gr.Row():
+                    query_output = gr.Dataframe(
+                        label="Query Results",
+                        wrap=True
+                    )
         def evaluate_language(language_code, model_name):
             test_data = load_afrimmlu_data(language_code)
             return summary_df, plot, detailed_df
+        # Evaluation tab callback
         evaluate_btn.click(
             fn=evaluate_language,
             inputs=[language_input, model_input],
             outputs=[summary_table, summary_plot, detailed_results]
         )
+        # Query tab callbacks
+        example_queries.change(
+            fn=lambda x: x,
+            inputs=[example_queries],
+            outputs=[query_input]
+        )
+        query_button.click(
+            fn=query_database,
+            inputs=[query_input],
+            outputs=[query_output]
+        )
     return demo
 if __name__ == "__main__":
     os.environ['DEEPSEEK_API_KEY']
     os.environ['HF_TOKEN']