Terry Zhuo committed on
Commit
7ecd52b
·
1 Parent(s): 7997b03
Files changed (4) hide show
  1. Dockerfile +5 -0
  2. README.md +12 -5
  3. app.py +95 -0
  4. pyproject.toml +13 -0
Dockerfile ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ FROM bigcodebench/bigcodebench-gradio:latest
2
+ COPY . /app
3
+ EXPOSE 7860
4
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
5
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,17 @@
1
  ---
2
- title: Bigcodebench Interaction
3
- emoji: 👁
4
- colorFrom: indigo
5
  colorTo: indigo
6
  sdk: docker
 
 
7
  pinned: false
 
 
 
 
 
 
8
  ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: BigCodeBench Evaluator
3
+ emoji: 🥇
4
+ colorFrom: green
5
  colorTo: indigo
6
  sdk: docker
7
+ app_file: app.py
8
+ disable_embedding: true
9
  pinned: false
10
+ license: apache-2.0
11
+ tags:
12
+ - leaderboard
13
+ - eval:code
14
+ - test:public
15
+ - judge:auto
16
  ---
17
+ Paper: https://arxiv.org/abs/2406.15877
 
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import io
3
+ import sys
4
+ import logging
5
+ import multiprocessing
6
+ import os
7
+ import pickle
8
+ import threading
9
+ import time
10
+ from collections import Counter, defaultdict
11
+ from concurrent.futures import ProcessPoolExecutor, as_completed, wait, FIRST_COMPLETED
12
+ from datetime import datetime
13
+ from typing import Any, Dict, List, Tuple
14
+ from warnings import warn
15
+ from contextlib import redirect_stdout, redirect_stderr
16
+
17
+ import numpy as np
18
+ from huggingface_hub import HfApi
19
+ from bigcodebench.data.utils import CACHE_DIR
20
+ from bigcodebench.eval import PASS, compatible_eval_result, estimate_pass_at_k, untrusted_check
21
+ from bigcodebench.gen.util import trusted_check
22
+ from apscheduler.schedulers.background import BackgroundScheduler
23
+
24
# Identifier of the Hugging Face Space that this app periodically restarts.
REPO_ID = "bigcode/bigcodebench-interaction"

# Access token read from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Shared Hub client used by restart_space().
API = HfApi(token=HF_TOKEN)

# Type alias: a string paired with a list of booleans.
Result = Tuple[str, List[bool]]
28
+
29
+
30
def run_code(code: str) -> str:
    """Execute a snippet of Python source and return what it produced.

    The snippet runs with stdout and stderr redirected into in-memory buffers.
    If the final top-level statement is an expression, its value is appended
    to the output as ``>>> value`` (unless the value is ``None``), similar to
    an interactive interpreter.

    Args:
        code: Python source text to run. ``None`` or an empty string is
            treated as an empty program.

    Returns:
        The captured stdout, optionally followed by the value of the trailing
        expression and by an ``--- Errors ---`` section holding captured
        stderr. If the snippet raises, ``"Error: <message>"`` is returned
        instead. When nothing at all was produced, a fixed success message is
        returned.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    stdout_buffer = io.StringIO()
    stderr_buffer = io.StringIO()

    # SECURITY: this executes arbitrary caller-supplied code with the full
    # privileges of the server process. A fresh namespace keeps snippets from
    # trivially reading or overwriting this module's globals (e.g. the Hub
    # token), but it is NOT a sandbox.
    #
    # A single dict is used as both globals and locals on purpose: with two
    # separate mappings, functions defined by the snippet cannot see each
    # other (the code behaves as if it were inside a class body).
    namespace = {"__name__": "__main__"}

    # NOTE: redirect_stdout/redirect_stderr swap the process-wide sys.stdout
    # and sys.stderr, so output of concurrently running requests can mix.
    with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
        try:
            tree = ast.parse(code or "", mode="exec")

            # Split off a trailing expression so that it is evaluated exactly
            # once and its value can be echoed, instead of executing the whole
            # snippet and then re-evaluating the last line a second time.
            trailing_expr = None
            if tree.body and isinstance(tree.body[-1], ast.Expr):
                trailing_expr = ast.Expression(body=tree.body.pop().value)

            exec(compile(tree, "<user code>", "exec"), namespace)

            value = None
            if trailing_expr is not None:
                value = eval(compile(trailing_expr, "<user code>", "eval"), namespace)

            result = stdout_buffer.getvalue()
            if value is not None:
                result += f"\n>>> {value}"

            # Append anything the snippet wrote to stderr.
            errors = stderr_buffer.getvalue()
            if errors:
                result += "\n--- Errors ---\n" + errors

        except Exception as e:
            # Report the failure to the caller instead of propagating it.
            result = f"Error: {str(e)}"

    return result if result.strip() else "Code executed successfully (no output)"
68
+
69
# Gradio UI: a single Python code editor feeding run_code, whose returned
# text is displayed in a plain textbox.
interface = gr.Interface(
    fn=run_code,
    inputs=[gr.Code(label="Python Code", language="python")],
    outputs=[gr.Textbox(label="Output")],
)
# Enable the request queue with no default concurrency limit.
interface.queue(default_concurrency_limit=None)
80
+
81
+
82
def restart_space():
    """Ask the Hugging Face Hub to restart the Space identified by REPO_ID.

    Any exception raised by the restart request is logged and swallowed, so
    this function never raises to its caller.
    """
    logging.info(f"Restarting space with repo ID: {REPO_ID}")
    try:
        API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
    except Exception as err:
        logging.error(f"Failed to restart space: {err}")
    else:
        # Only reached when the restart request did not raise.
        logging.info("Space restarted successfully.")
90
+
91
+
92
# Restart the Space every 5 hours from a background scheduler, then start
# serving the Gradio app (show_error surfaces errors in the UI).
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", hours=5)
scheduler.start()
interface.launch(show_error=True)
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119