jjkim
committed on
Commit
·
ad5b61a
1
Parent(s):
e12251f
add ignore assertion errors option
Browse files- code_eval.py +15 -3
- execute.py +31 -8
code_eval.py
CHANGED
@@ -168,6 +168,7 @@ class CodeEval(evaluate.Metric):
|
|
168 |
timeout=3.0,
|
169 |
early_stop=False,
|
170 |
disable_tqdm=False,
|
|
|
171 |
):
|
172 |
"""Returns the scores"""
|
173 |
|
@@ -184,7 +185,11 @@ class CodeEval(evaluate.Metric):
|
|
184 |
for tid, pred, ref in zip(ids, predictions, references):
|
185 |
results[tid] = []
|
186 |
for pid, p in enumerate(pred):
|
187 |
-
result = Result(
|
|
|
|
|
|
|
|
|
188 |
body = Template(pred_template).safe_substitute(prediction=p)
|
189 |
for r in ref:
|
190 |
assert isinstance(r, str)
|
@@ -192,7 +197,13 @@ class CodeEval(evaluate.Metric):
|
|
192 |
test = Template(test).safe_substitute(prediction=p)
|
193 |
|
194 |
test_program = body + "\n" + test
|
195 |
-
args = (
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
future = executor.submit(check_correctness, *args)
|
197 |
result.test_program.append(test_program)
|
198 |
result.add(future)
|
@@ -263,7 +274,8 @@ def estimate_pass_at_k(num_samples, num_correct, k):
|
|
263 |
class Result(BaseModel):
|
264 |
task_id: str
|
265 |
prediction_id: int
|
266 |
-
|
|
|
267 |
test_program: List[str] = []
|
268 |
passed: Optional[bool] = None
|
269 |
result: List[str] = []
|
|
|
168 |
timeout=3.0,
|
169 |
early_stop=False,
|
170 |
disable_tqdm=False,
|
171 |
+
ignore_assertion_errors=False,
|
172 |
):
|
173 |
"""Returns the scores"""
|
174 |
|
|
|
185 |
for tid, pred, ref in zip(ids, predictions, references):
|
186 |
results[tid] = []
|
187 |
for pid, p in enumerate(pred):
|
188 |
+
result = Result(
|
189 |
+
task_id=tid,
|
190 |
+
prediction_id=pid,
|
191 |
+
ignore_assertion_error=ignore_assertion_errors,
|
192 |
+
)
|
193 |
body = Template(pred_template).safe_substitute(prediction=p)
|
194 |
for r in ref:
|
195 |
assert isinstance(r, str)
|
|
|
197 |
test = Template(test).safe_substitute(prediction=p)
|
198 |
|
199 |
test_program = body + "\n" + test
|
200 |
+
args = (
|
201 |
+
test_program,
|
202 |
+
timeout,
|
203 |
+
tid,
|
204 |
+
pid,
|
205 |
+
ignore_assertion_errors,
|
206 |
+
)
|
207 |
future = executor.submit(check_correctness, *args)
|
208 |
result.test_program.append(test_program)
|
209 |
result.add(future)
|
|
|
274 |
class Result(BaseModel):
|
275 |
task_id: str
|
276 |
prediction_id: int
|
277 |
+
ignore_assertion_error: bool = False
|
278 |
+
|
279 |
test_program: List[str] = []
|
280 |
passed: Optional[bool] = None
|
281 |
result: List[str] = []
|
execute.py
CHANGED
@@ -25,7 +25,13 @@ import signal
|
|
25 |
import tempfile
|
26 |
|
27 |
|
28 |
-
def check_correctness(
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
"""
|
30 |
Evaluates the functional correctness of a completion by running the test
|
31 |
suite provided in the problem.
|
@@ -36,7 +42,10 @@ def check_correctness(check_program, timeout, task_id, completion_id):
|
|
36 |
manager = multiprocessing.Manager()
|
37 |
result = manager.list()
|
38 |
|
39 |
-
p = multiprocessing.Process(
|
|
|
|
|
|
|
40 |
p.start()
|
41 |
p.join(timeout=timeout + 1)
|
42 |
if p.is_alive():
|
@@ -53,10 +62,13 @@ def check_correctness(check_program, timeout, task_id, completion_id):
|
|
53 |
)
|
54 |
|
55 |
|
56 |
-
def unsafe_execute(
|
57 |
-
|
|
|
|
|
|
|
|
|
58 |
with create_tempdir():
|
59 |
-
|
60 |
# These system calls are needed when cleaning up tempdir.
|
61 |
import os
|
62 |
import shutil
|
@@ -77,6 +89,11 @@ def unsafe_execute(check_program, result, timeout):
|
|
77 |
result.append("passed")
|
78 |
except TimeoutException:
|
79 |
result.append("timed out")
|
|
|
|
|
|
|
|
|
|
|
80 |
except BaseException as e:
|
81 |
result.append(f"failed: {e}")
|
82 |
|
@@ -171,10 +188,16 @@ def reliability_guard(maximum_memory_bytes=None):
|
|
171 |
if maximum_memory_bytes is not None:
|
172 |
import resource
|
173 |
|
174 |
-
resource.setrlimit(
|
175 |
-
|
|
|
|
|
|
|
|
|
176 |
if not platform.uname().system == "Darwin":
|
177 |
-
resource.setrlimit(
|
|
|
|
|
178 |
|
179 |
faulthandler.disable()
|
180 |
|
|
|
25 |
import tempfile
|
26 |
|
27 |
|
28 |
+
def check_correctness(
|
29 |
+
check_program,
|
30 |
+
timeout,
|
31 |
+
task_id,
|
32 |
+
completion_id,
|
33 |
+
ignore_assertion_errors=False,
|
34 |
+
):
|
35 |
"""
|
36 |
Evaluates the functional correctness of a completion by running the test
|
37 |
suite provided in the problem.
|
|
|
42 |
manager = multiprocessing.Manager()
|
43 |
result = manager.list()
|
44 |
|
45 |
+
p = multiprocessing.Process(
|
46 |
+
target=unsafe_execute,
|
47 |
+
args=(check_program, result, timeout, ignore_assertion_errors),
|
48 |
+
)
|
49 |
p.start()
|
50 |
p.join(timeout=timeout + 1)
|
51 |
if p.is_alive():
|
|
|
62 |
)
|
63 |
|
64 |
|
65 |
+
def unsafe_execute(
|
66 |
+
check_program,
|
67 |
+
result,
|
68 |
+
timeout,
|
69 |
+
ignore_assertion_errors=False,
|
70 |
+
):
|
71 |
with create_tempdir():
|
|
|
72 |
# These system calls are needed when cleaning up tempdir.
|
73 |
import os
|
74 |
import shutil
|
|
|
89 |
result.append("passed")
|
90 |
except TimeoutException:
|
91 |
result.append("timed out")
|
92 |
+
except AssertionError as e:
|
93 |
+
if ignore_assertion_errors:
|
94 |
+
result.append("passed")
|
95 |
+
else:
|
96 |
+
result.append(f"failed: {e}")
|
97 |
except BaseException as e:
|
98 |
result.append(f"failed: {e}")
|
99 |
|
|
|
188 |
if maximum_memory_bytes is not None:
|
189 |
import resource
|
190 |
|
191 |
+
resource.setrlimit(
|
192 |
+
resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
|
193 |
+
)
|
194 |
+
resource.setrlimit(
|
195 |
+
resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
|
196 |
+
)
|
197 |
if not platform.uname().system == "Darwin":
|
198 |
+
resource.setrlimit(
|
199 |
+
resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
|
200 |
+
)
|
201 |
|
202 |
faulthandler.disable()
|
203 |
|