|
|
|
import argparse
|
|
import glob
|
|
import json
|
|
import os
|
|
import random
|
|
from collections import Counter
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from openhands.events.serialization import event_from_dict
|
|
from openhands.events.utils import get_pairs_from_events
|
|
|
|
# Substrings searched for in the raw text of each output line; the first one
# found is the category that line is counted under in the error breakdown.
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
    'litellm.Timeout: APITimeoutError',
]
|
|
|
|
|
|
def get_bootstrap_accuracy_error_bars(
|
|
values: float | int | bool, num_samples: int = 1000, p_value=0.05
|
|
) -> tuple[float, float]:
|
|
sorted_vals = np.sort(
|
|
[np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
|
|
)
|
|
bottom_idx = int(num_samples * p_value / 2)
|
|
top_idx = int(num_samples * (1.0 - p_value / 2))
|
|
return (sorted_vals[bottom_idx], sorted_vals[top_idx])
|
|
|
|
|
|
def _safe_ratio(numerator, denominator, scale=1):
    """Return ``numerator / denominator * scale``, or 0 when the denominator is not positive."""
    return (numerator / denominator * scale) if denominator > 0 else 0


def _split_costs(costs):
    """Sum a list of cost entries into ``(main_agent_cost, editor_cost)``."""
    main_cost = 0
    editor_cost = 0
    for cost in costs:
        if isinstance(cost, (int, float)):
            # A bare number carries no model tag, so it is attributed to the
            # main agent. Integers are accepted as well as floats.
            main_cost += cost
        elif 'draft_editor' in cost['model']:
            editor_cost += cost['cost']
        else:
            main_cost += cost['cost']
    return main_cost, editor_cost


def process_file(file_path):
    """Summarize one JSONL output file.

    Every non-blank line of the file is parsed as a JSON object describing one
    instance. The function tallies unfinished runs (no ``metrics``), empty
    patches, resolved instances, errors, per-instance costs and turn counts.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A dict with the keys ``file_path``, ``total_instances``, ``resolved``
        (count, percentage, ``ci``), ``empty_patches``, ``unfinished_runs``,
        ``errors`` (total, percentage, ``stuck_in_loop``, ``breakdown``),
        ``costs`` (summed) and ``statistics`` (per-instance averages). All
        percentages and averages use the number of non-blank lines as the
        denominator and are 0 when the file has no instances.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines are not instances: skip them so they neither crash the
        # JSON parser nor inflate the instance count.
        lines = [line for line in file if line.strip()]

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0
    num_resolved = 0
    resolved_arr = []
    num_empty_patch = 0
    num_unfinished_runs = 0
    error_counter = Counter()
    main_agent_cost = []
    editor_cost = []
    num_turns = []

    for line in lines:
        record = json.loads(line)

        # A record without metrics is counted as an unfinished run and
        # contributes nothing else to the summary.
        metrics = record.get('metrics')
        if metrics is None:
            num_unfinished_runs += 1
            continue

        # Costs, split between the main agent and the draft editor.
        instance_main_cost, instance_editor_cost = _split_costs(
            metrics.get('costs') or []
        )
        main_agent_cost.append(instance_main_cost)
        editor_cost.append(instance_editor_cost)

        # Turns: one per pair reported by get_pairs_from_events.
        events = [event_from_dict(event) for event in record.get('history') or []]
        num_turns.append(len(get_pairs_from_events(events)))

        # Empty patch: a missing or null test_result / git_patch counts as empty.
        patch = (record.get('test_result') or {}).get('git_patch') or ''
        if patch == '':
            num_empty_patch += 1
            continue

        # Resolved.
        # NOTE(review): resolved_arr only receives instances that reach this
        # point, so the bootstrap interval excludes unfinished and empty-patch
        # runs while the reported percentage divides by all instances.
        # Confirm whether that is intended.
        report = record.get('report', {}) or {}
        resolved = report.get('resolved', False)
        if resolved:
            num_resolved += 1
        resolved_arr.append(1 if resolved else 0)

        # Errors reported through the record's own 'error' field.
        # NOTE(review): these are added to the breakdown but not to
        # num_error_lines, which only counts keyword matches below.
        error = record.get('error', None)
        if isinstance(error, str):
            if 'Agent got stuck in a loop' in error:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif error:
                error_counter[error] += 1
            # A string 'error' field (even an empty one) skips the keyword scan.
            continue

        # Otherwise scan the raw line text; only the first matching keyword counts.
        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    total_main_cost = sum(main_agent_cost)
    total_editor_cost = sum(editor_cost)
    total_cost = total_main_cost + total_editor_cost

    return {
        'file_path': file_path,
        'total_instances': num_lines,
        'resolved': {
            'count': num_resolved,
            'percentage': _safe_ratio(num_resolved, num_lines, 100),
            'ci': tuple(
                x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)
            ),
        },
        'empty_patches': {
            'count': num_empty_patch,
            'percentage': _safe_ratio(num_empty_patch, num_lines, 100),
        },
        'unfinished_runs': {
            'count': num_unfinished_runs,
            'percentage': _safe_ratio(num_unfinished_runs, num_lines, 100),
        },
        'errors': {
            'total': num_error_lines,
            'percentage': _safe_ratio(num_error_lines, num_lines, 100),
            'stuck_in_loop': {
                'count': num_agent_stuck_in_loop,
                'percentage': _safe_ratio(num_agent_stuck_in_loop, num_lines, 100),
            },
            'breakdown': {
                str(error): {
                    'count': count,
                    'percentage': _safe_ratio(count, num_lines, 100),
                }
                for error, count in error_counter.items()
            },
        },
        'costs': {
            'main_agent': total_main_cost,
            'editor': total_editor_cost,
            'total': total_cost,
        },
        'statistics': {
            'avg_turns': _safe_ratio(sum(num_turns), num_lines),
            'costs': {
                'main_agent': _safe_ratio(total_main_cost, num_lines),
                'editor': _safe_ratio(total_editor_cost, num_lines),
                'total': _safe_ratio(total_cost, num_lines),
            },
        },
    }
|
|
|
|
|
|
def aggregate_directory(input_path) -> pd.DataFrame:
    """Summarize every ``output.jsonl`` found under ``input_path``.

    The directory is searched recursively. Each file is passed to
    ``process_file``; a file that raises is reported (message and traceback
    are printed) and skipped, so one bad file does not abort the run.

    Args:
        input_path: Directory to search.

    Returns:
        A DataFrame with one row per successfully processed file, sorted by
        ``resolve_rate`` in descending order. Besides the raw summary columns
        it carries the flattened columns ``directory``, ``resolve_rate``,
        ``resolve_rate_ci``, ``empty_patch_rate``, ``unfinished_rate``,
        ``avg_turns``, ``error_rate`` and ``avg_cost``. When no file could be
        processed the DataFrame is empty but still has all of these columns.
    """
    import traceback

    pattern = os.path.join(input_path, '**/output.jsonl')
    files = glob.glob(pattern, recursive=True)
    print(f'Processing {len(files)} files from directory {input_path}')

    results = []
    for file_path in files:
        try:
            results.append(process_file(file_path))
        except Exception as e:
            # Best effort: report the failure and carry on with the other files.
            print(f'Error processing {file_path}: {str(e)}')
            traceback.print_exc()

    if results:
        df = pd.DataFrame(results)
    else:
        # Without rows pandas would create no columns at all and the lookups
        # below would raise KeyError, so declare the expected columns up front.
        df = pd.DataFrame(
            columns=[
                'file_path',
                'total_instances',
                'resolved',
                'empty_patches',
                'unfinished_runs',
                'errors',
                'costs',
                'statistics',
            ]
        )

    # The immediate parent directory of each output file identifies its row.
    df['directory'] = df['file_path'].apply(
        lambda x: os.path.basename(os.path.dirname(x))
    )

    # Flatten the nested summary dicts into plain columns.
    df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
    df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci'])
    df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
    df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
    df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
    df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
    df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])

    df = df.sort_values('resolve_rate', ascending=False)

    return df
|
|
|
|
|
|
# Command-line entry point.
#   * Directory input: aggregate every output.jsonl below it, print a table and
#     write it next to --output as .txt and .csv, plus the full records as JSONL.
#   * File input: print a detailed summary of that single file (--output is
#     not used in this mode).
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_path', type=str, help='The file or directory to summarize'
    )
    parser.add_argument(
        '--output',
        type=str,
        help='Output JSONL file for results',
        default='summary_results.jsonl',
    )
    args = parser.parse_args()

    if os.path.isdir(args.input_path):
        df = aggregate_directory(args.input_path)

        # Columns shown in the printed table and written to the CSV.
        columns = [
            'directory',
            'resolve_rate',
            'empty_patch_rate',
            'unfinished_rate',
            'error_rate',
            'avg_turns',
            'avg_cost',
            'total_instances',
        ]
        summary_str = df[columns].to_string(
            float_format=lambda x: '{:.2f}'.format(x),
            formatters={
                # Keep the table readable by truncating long directory names.
                'directory': lambda x: x[:90]
            },
            index=False,
        )

        print('\nResults summary (sorted by resolve rate):')
        print(summary_str)

        # Derive sibling file names from --output. splitext only strips a real
        # extension of the final path component, so a dot in a directory name
        # (or a missing extension) cannot truncate the path.
        output_stem = os.path.splitext(args.output)[0]

        with open(output_stem + '.txt', 'w') as f:
            f.write('Results summary (sorted by resolve rate):\n')
            f.write(summary_str)

        df.to_json(args.output, lines=True, orient='records')
        df[columns].to_csv(output_stem + '.csv', index=False)
    else:
        try:
            result = process_file(args.input_path)

            print(f'\nResults for {args.input_path}:')
            print(
                f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
            )
            print(
                f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
            )
            print(
                f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
            )
            print(
                f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
            )
            print(
                f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
            )
            print(f"Total cost: {result['costs']['total']:.2f} USD")
            print('## Statistics')
            print(
                f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
            )
            print(
                f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
            )
            print(
                f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
            )
            print(
                f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
            )

            print('## Detailed error breakdown:')
            for error, data in result['errors']['breakdown'].items():
                print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")

        except Exception as e:
            # Best effort: report the problem instead of dumping a traceback.
            print(f'Error processing {args.input_path}: {str(e)}')
|
|
|