#!/usr/bin/env python3
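"""Summarize OpenHands evaluation results stored as output.jsonl files.

Given a single output.jsonl file, prints a detailed report: resolve rate with a
bootstrap confidence interval, empty-patch/unfinished/error counts, costs, and
average turns. Given a directory, aggregates every output.jsonl found under it
into a table sorted by resolve rate and writes JSONL, CSV, and TXT copies based
on --output.

Example invocation (the script filename here is illustrative):

    python summarize_outputs.py path/to/evaluation_outputs --output summary_results.jsonl
"""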
import argparse
import glob
import json
import os
import random
from collections import Counter
import numpy as np
import pandas as pd
from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events
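# Substrings searched for in each raw output line to flag infrastructure / LLM errors.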
ERROR_KEYWORDS = [
'Agent encountered an error while processing the last action',
'APIError',
'Action execution failed',
'litellm.Timeout: APITimeoutError',
]
def get_bootstrap_accuracy_error_bars(
    values: list[float | int | bool], num_samples: int = 1000, p_value: float = 0.05
) -> tuple[float, float]:
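    """Estimate a (1 - p_value) confidence interval for the mean of ``values``.

    Uses a subsampling bootstrap: repeatedly draws half of the values without
    replacement, takes the mean of each draw, and returns the empirical lower
    and upper percentiles of those means.
    """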
sorted_vals = np.sort(
[np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
)
bottom_idx = int(num_samples * p_value / 2)
top_idx = int(num_samples * (1.0 - p_value / 2))
return (sorted_vals[bottom_idx], sorted_vals[top_idx])
def process_file(file_path):
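    """Summarize one output.jsonl file (one evaluation instance per line).

    Collects resolve counts (with a bootstrap CI), empty-patch and unfinished
    counts, an error breakdown, main-agent/editor costs, and turn counts, and
    returns them as a nested dict keyed by metric group.
    """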
with open(file_path, 'r') as file:
lines = file.readlines()
num_lines = len(lines)
num_error_lines = 0
num_agent_stuck_in_loop = 0
num_resolved = 0
resolved_arr = []
num_empty_patch = 0
num_unfinished_runs = 0
error_counter = Counter()
main_agent_cost = []
editor_cost = []
num_turns = []
for line in lines:
_d = json.loads(line)
if 'metrics' not in _d or _d['metrics'] is None:
            # no metrics recorded: count this as an unfinished run
num_unfinished_runs += 1
continue
# Cost
costs = _d['metrics'].get('costs', [])
_cur_main_agent_cost = 0
_cur_editor_cost = 0
for cost in costs:
if isinstance(cost, float):
                # backward compatible: older outputs stored each cost as a bare float
_cur_main_agent_cost += cost
else:
if 'draft_editor' in cost['model']:
_cur_editor_cost += cost['cost']
else:
_cur_main_agent_cost += cost['cost']
main_agent_cost.append(_cur_main_agent_cost)
editor_cost.append(_cur_editor_cost)
        # Turn status: one (action, observation) pair per agent turn
history = _d.get('history', [])
events = [event_from_dict(event) for event in history]
pairs = get_pairs_from_events(events)
num_turns.append(len(pairs))
# Patch & resolve status
patch = _d.get('test_result', {}).get('git_patch', '')
if patch == '':
num_empty_patch += 1
continue
report = _d.get('report', {}) or {}
resolved = report.get('resolved', False)
if resolved:
num_resolved += 1
resolved_arr.append(1)
else:
resolved_arr.append(0)
# Error
error = _d.get('error', None)
if error is not None and isinstance(error, str):
agent_stuck_in_loop = 'Agent got stuck in a loop' in error
contains_error = bool(error) and not agent_stuck_in_loop
if agent_stuck_in_loop:
error_counter['Agent got stuck in a loop'] += 1
num_agent_stuck_in_loop += 1
elif contains_error:
error_counter[error] += 1
continue
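        # No structured error recorded: fall back to keyword search over the raw JSON line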
for keyword in ERROR_KEYWORDS:
if keyword in line:
error_counter[keyword] += 1
num_error_lines += 1
break
return {
'file_path': file_path,
'total_instances': num_lines,
'resolved': {
'count': num_resolved,
'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
'ci': tuple(
x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)
),
},
'empty_patches': {
'count': num_empty_patch,
'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
},
'unfinished_runs': {
'count': num_unfinished_runs,
'percentage': (num_unfinished_runs / num_lines * 100)
if num_lines > 0
else 0,
},
'errors': {
'total': num_error_lines,
'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
'stuck_in_loop': {
'count': num_agent_stuck_in_loop,
'percentage': (num_agent_stuck_in_loop / num_lines * 100)
if num_lines > 0
else 0,
},
'breakdown': {
str(error): {
'count': count,
'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
}
for error, count in error_counter.items()
},
},
'costs': {
'main_agent': sum(main_agent_cost),
'editor': sum(editor_cost),
'total': sum(main_agent_cost) + sum(editor_cost),
},
'statistics': {
'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
'costs': {
'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
if num_lines > 0
else 0,
},
},
}
def aggregate_directory(input_path) -> pd.DataFrame:
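    """Run process_file on every output.jsonl under ``input_path``.

    Returns a DataFrame with one row per file and flattened rate/cost columns,
    sorted by resolve rate in descending order. Files that fail to parse are
    skipped with a traceback.
    """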
# Process all output.jsonl files in subdirectories
pattern = os.path.join(input_path, '**/output.jsonl')
files = glob.glob(pattern, recursive=True)
print(f'Processing {len(files)} files from directory {input_path}')
# Process each file silently and collect results
results = []
for file_path in files:
try:
result = process_file(file_path)
results.append(result)
except Exception as e:
print(f'Error processing {file_path}: {str(e)}')
import traceback
traceback.print_exc()
continue
# Convert results to pandas DataFrame and sort by resolve rate
df = pd.DataFrame(results)
# Extract directory name from file path
df['directory'] = df['file_path'].apply(
lambda x: os.path.basename(os.path.dirname(x))
)
df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci'])
df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
df = df.sort_values('resolve_rate', ascending=False)
return df
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'input_path', type=str, help='The file or directory to summarize'
)
parser.add_argument(
'--output',
type=str,
help='Output JSONL file for results',
default='summary_results.jsonl',
)
args = parser.parse_args()
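    # Directory input: aggregate all output.jsonl files; single file: print a detailed report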
if os.path.isdir(args.input_path):
df = aggregate_directory(args.input_path)
# Create the summary string
columns = [
'directory',
'resolve_rate',
'empty_patch_rate',
'unfinished_rate',
'error_rate',
'avg_turns',
'avg_cost',
'total_instances',
]
summary_str = df[columns].to_string(
float_format=lambda x: '{:.2f}'.format(x),
formatters={
'directory': lambda x: x[:90]
            }, # Truncate directory names to 90 chars
index=False,
)
# Print to console
print('\nResults summary (sorted by resolve rate):')
print(summary_str)
# Save to text file
txt_output = args.output.rsplit('.', 1)[0] + '.txt'
with open(txt_output, 'w') as f:
f.write('Results summary (sorted by resolve rate):\n')
f.write(summary_str)
        # Save full results as JSONL and the summary columns as CSV
df.to_json(args.output, lines=True, orient='records')
df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
else:
# Process single file with detailed output
results = []
try:
result = process_file(args.input_path)
results.append(result)
# Print detailed results for single file
print(f'\nResults for {args.input_path}:')
print(
f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
)
print(
f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
)
print(
f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
)
print(
f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
)
print(
f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
)
print(f"Total cost: {result['costs']['total']:.2f} USD")
print('## Statistics')
print(
f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
)
print(
f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
)
print(
f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
)
print(
f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
)
print('## Detailed error breakdown:')
for error, data in result['errors']['breakdown'].items():
print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
except Exception as e:
print(f'Error processing {args.input_path}: {str(e)}')