jjkim committed on
Commit
ad5b61a
·
1 Parent(s): e12251f

add ignore assertion errors option

Browse files
Files changed (2) hide show
  1. code_eval.py +15 -3
  2. execute.py +31 -8
code_eval.py CHANGED
@@ -168,6 +168,7 @@ class CodeEval(evaluate.Metric):
168
  timeout=3.0,
169
  early_stop=False,
170
  disable_tqdm=False,
 
171
  ):
172
  """Returns the scores"""
173
 
@@ -184,7 +185,11 @@ class CodeEval(evaluate.Metric):
184
  for tid, pred, ref in zip(ids, predictions, references):
185
  results[tid] = []
186
  for pid, p in enumerate(pred):
187
- result = Result(task_id=tid, prediction_id=pid)
 
 
 
 
188
  body = Template(pred_template).safe_substitute(prediction=p)
189
  for r in ref:
190
  assert isinstance(r, str)
@@ -192,7 +197,13 @@ class CodeEval(evaluate.Metric):
192
  test = Template(test).safe_substitute(prediction=p)
193
 
194
  test_program = body + "\n" + test
195
- args = (test_program, timeout, tid, pid)
 
 
 
 
 
 
196
  future = executor.submit(check_correctness, *args)
197
  result.test_program.append(test_program)
198
  result.add(future)
@@ -263,7 +274,8 @@ def estimate_pass_at_k(num_samples, num_correct, k):
263
  class Result(BaseModel):
264
  task_id: str
265
  prediction_id: int
266
-
 
267
  test_program: List[str] = []
268
  passed: Optional[bool] = None
269
  result: List[str] = []
 
168
  timeout=3.0,
169
  early_stop=False,
170
  disable_tqdm=False,
171
+ ignore_assertion_errors=False,
172
  ):
173
  """Returns the scores"""
174
 
 
185
  for tid, pred, ref in zip(ids, predictions, references):
186
  results[tid] = []
187
  for pid, p in enumerate(pred):
188
+ result = Result(
189
+ task_id=tid,
190
+ prediction_id=pid,
191
+ ignore_assertion_error=ignore_assertion_errors,
192
+ )
193
  body = Template(pred_template).safe_substitute(prediction=p)
194
  for r in ref:
195
  assert isinstance(r, str)
 
197
  test = Template(test).safe_substitute(prediction=p)
198
 
199
  test_program = body + "\n" + test
200
+ args = (
201
+ test_program,
202
+ timeout,
203
+ tid,
204
+ pid,
205
+ ignore_assertion_errors,
206
+ )
207
  future = executor.submit(check_correctness, *args)
208
  result.test_program.append(test_program)
209
  result.add(future)
 
274
  class Result(BaseModel):
275
  task_id: str
276
  prediction_id: int
277
+ ignore_assertion_error: bool = False
278
+
279
  test_program: List[str] = []
280
  passed: Optional[bool] = None
281
  result: List[str] = []
execute.py CHANGED
@@ -25,7 +25,13 @@ import signal
25
  import tempfile
26
 
27
 
28
- def check_correctness(check_program, timeout, task_id, completion_id):
 
 
 
 
 
 
29
  """
30
  Evaluates the functional correctness of a completion by running the test
31
  suite provided in the problem.
@@ -36,7 +42,10 @@ def check_correctness(check_program, timeout, task_id, completion_id):
36
  manager = multiprocessing.Manager()
37
  result = manager.list()
38
 
39
- p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
 
 
 
40
  p.start()
41
  p.join(timeout=timeout + 1)
42
  if p.is_alive():
@@ -53,10 +62,13 @@ def check_correctness(check_program, timeout, task_id, completion_id):
53
  )
54
 
55
 
56
- def unsafe_execute(check_program, result, timeout):
57
-
 
 
 
 
58
  with create_tempdir():
59
-
60
  # These system calls are needed when cleaning up tempdir.
61
  import os
62
  import shutil
@@ -77,6 +89,11 @@ def unsafe_execute(check_program, result, timeout):
77
  result.append("passed")
78
  except TimeoutException:
79
  result.append("timed out")
 
 
 
 
 
80
  except BaseException as e:
81
  result.append(f"failed: {e}")
82
 
@@ -171,10 +188,16 @@ def reliability_guard(maximum_memory_bytes=None):
171
  if maximum_memory_bytes is not None:
172
  import resource
173
 
174
- resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
175
- resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
 
 
 
 
176
  if not platform.uname().system == "Darwin":
177
- resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
 
 
178
 
179
  faulthandler.disable()
180
 
 
25
  import tempfile
26
 
27
 
28
+ def check_correctness(
29
+ check_program,
30
+ timeout,
31
+ task_id,
32
+ completion_id,
33
+ ignore_assertion_errors=False,
34
+ ):
35
  """
36
  Evaluates the functional correctness of a completion by running the test
37
  suite provided in the problem.
 
42
  manager = multiprocessing.Manager()
43
  result = manager.list()
44
 
45
+ p = multiprocessing.Process(
46
+ target=unsafe_execute,
47
+ args=(check_program, result, timeout, ignore_assertion_errors),
48
+ )
49
  p.start()
50
  p.join(timeout=timeout + 1)
51
  if p.is_alive():
 
62
  )
63
 
64
 
65
+ def unsafe_execute(
66
+ check_program,
67
+ result,
68
+ timeout,
69
+ ignore_assertion_errors=False,
70
+ ):
71
  with create_tempdir():
 
72
  # These system calls are needed when cleaning up tempdir.
73
  import os
74
  import shutil
 
89
  result.append("passed")
90
  except TimeoutException:
91
  result.append("timed out")
92
+ except AssertionError as e:
93
+ if ignore_assertion_errors:
94
+ result.append("passed")
95
+ else:
96
+ result.append(f"failed: {e}")
97
  except BaseException as e:
98
  result.append(f"failed: {e}")
99
 
 
188
  if maximum_memory_bytes is not None:
189
  import resource
190
 
191
+ resource.setrlimit(
192
+ resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
193
+ )
194
+ resource.setrlimit(
195
+ resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
196
+ )
197
  if not platform.uname().system == "Darwin":
198
+ resource.setrlimit(
199
+ resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
200
+ )
201
 
202
  faulthandler.disable()
203