Spaces:

ar08
/

zzz

Running

App Files Files Community

zzz / evaluation /benchmarks /swe_bench /scripts /eval /summarize_outputs.py

ar08

Upload 1040 files

246d201 verified 3 days ago

raw

history blame contribute delete

10.9 kB

	#!/usr/bin/env python3
	import argparse
	import glob
	import json
	import os
	import random
	from collections import Counter

	import numpy as np
	import pandas as pd

	from openhands.events.serialization import event_from_dict
	from openhands.events.utils import get_pairs_from_events

	ERROR_KEYWORDS = [
	'Agent encountered an error while processing the last action',
	'APIError',
	'Action execution failed',
	'litellm.Timeout: APITimeoutError',
	]


	def get_bootstrap_accuracy_error_bars(
	values: float \| int \| bool, num_samples: int = 1000, p_value=0.05
	) -> tuple[float, float]:
	sorted_vals = np.sort(
	[np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
	)
	bottom_idx = int(num_samples * p_value / 2)
	top_idx = int(num_samples * (1.0 - p_value / 2))
	return (sorted_vals[bottom_idx], sorted_vals[top_idx])


	def process_file(file_path):
	with open(file_path, 'r') as file:
	lines = file.readlines()

	num_lines = len(lines)
	num_error_lines = 0
	num_agent_stuck_in_loop = 0
	num_resolved = 0
	resolved_arr = []
	num_empty_patch = 0
	num_unfinished_runs = 0
	error_counter = Counter()
	main_agent_cost = []
	editor_cost = []
	num_turns = []

	for line in lines:
	_d = json.loads(line)

	if 'metrics' not in _d or _d['metrics'] is None:
	# this is a failed run
	num_unfinished_runs += 1
	continue

	# Cost
	costs = _d['metrics'].get('costs', [])
	_cur_main_agent_cost = 0
	_cur_editor_cost = 0
	for cost in costs:
	if isinstance(cost, float):
	# backward compatible
	_cur_main_agent_cost += cost
	else:
	if 'draft_editor' in cost['model']:
	_cur_editor_cost += cost['cost']
	else:
	_cur_main_agent_cost += cost['cost']

	main_agent_cost.append(_cur_main_agent_cost)
	editor_cost.append(_cur_editor_cost)

	# Turn status
	history = _d.get('history', [])
	events = [event_from_dict(event) for event in history]
	pairs = get_pairs_from_events(events)
	num_turns.append(len(pairs))

	# Patch & resolve status
	patch = _d.get('test_result', {}).get('git_patch', '')
	if patch == '':
	num_empty_patch += 1
	continue

	report = _d.get('report', {}) or {}
	resolved = report.get('resolved', False)
	if resolved:
	num_resolved += 1
	resolved_arr.append(1)
	else:
	resolved_arr.append(0)

	# Error
	error = _d.get('error', None)

	if error is not None and isinstance(error, str):
	agent_stuck_in_loop = 'Agent got stuck in a loop' in error
	contains_error = bool(error) and not agent_stuck_in_loop
	if agent_stuck_in_loop:
	error_counter['Agent got stuck in a loop'] += 1
	num_agent_stuck_in_loop += 1
	elif contains_error:
	error_counter[error] += 1
	continue

	for keyword in ERROR_KEYWORDS:
	if keyword in line:
	error_counter[keyword] += 1
	num_error_lines += 1
	break

	return {
	'file_path': file_path,
	'total_instances': num_lines,
	'resolved': {
	'count': num_resolved,
	'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
	'ci': tuple(
	x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)
	),
	},
	'empty_patches': {
	'count': num_empty_patch,
	'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
	},
	'unfinished_runs': {
	'count': num_unfinished_runs,
	'percentage': (num_unfinished_runs / num_lines * 100)
	if num_lines > 0
	else 0,
	},
	'errors': {
	'total': num_error_lines,
	'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
	'stuck_in_loop': {
	'count': num_agent_stuck_in_loop,
	'percentage': (num_agent_stuck_in_loop / num_lines * 100)
	if num_lines > 0
	else 0,
	},
	'breakdown': {
	str(error): {
	'count': count,
	'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
	}
	for error, count in error_counter.items()
	},
	},
	'costs': {
	'main_agent': sum(main_agent_cost),
	'editor': sum(editor_cost),
	'total': sum(main_agent_cost) + sum(editor_cost),
	},
	'statistics': {
	'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
	'costs': {
	'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
	'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
	'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
	if num_lines > 0
	else 0,
	},
	},
	}


	def aggregate_directory(input_path) -> pd.DataFrame:
	# Process all output.jsonl files in subdirectories
	pattern = os.path.join(input_path, '**/output.jsonl')
	files = glob.glob(pattern, recursive=True)
	print(f'Processing {len(files)} files from directory {input_path}')

	# Process each file silently and collect results
	results = []
	for file_path in files:
	try:
	result = process_file(file_path)
	results.append(result)
	except Exception as e:
	print(f'Error processing {file_path}: {str(e)}')
	import traceback

	traceback.print_exc()
	continue

	# Convert results to pandas DataFrame and sort by resolve rate
	df = pd.DataFrame(results)

	# Extract directory name from file path
	df['directory'] = df['file_path'].apply(
	lambda x: os.path.basename(os.path.dirname(x))
	)

	df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
	df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci'])
	df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
	df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
	df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
	df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
	df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])

	df = df.sort_values('resolve_rate', ascending=False)

	return df


	if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	parser.add_argument(
	'input_path', type=str, help='The file or directory to summarize'
	)
	parser.add_argument(
	'--output',
	type=str,
	help='Output JSONL file for results',
	default='summary_results.jsonl',
	)
	args = parser.parse_args()

	if os.path.isdir(args.input_path):
	df = aggregate_directory(args.input_path)
	# Create the summary string
	columns = [
	'directory',
	'resolve_rate',
	'empty_patch_rate',
	'unfinished_rate',
	'error_rate',
	'avg_turns',
	'avg_cost',
	'total_instances',
	]
	summary_str = df[columns].to_string(
	float_format=lambda x: '{:.2f}'.format(x),
	formatters={
	'directory': lambda x: x[:90]
	}, # Truncate directory names to 20 chars
	index=False,
	)

	# Print to console
	print('\nResults summary (sorted by resolve rate):')
	print(summary_str)

	# Save to text file
	txt_output = args.output.rsplit('.', 1)[0] + '.txt'
	with open(txt_output, 'w') as f:
	f.write('Results summary (sorted by resolve rate):\n')
	f.write(summary_str)

	# Save
	df.to_json(args.output, lines=True, orient='records')
	df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
	else:
	# Process single file with detailed output
	results = []
	try:
	result = process_file(args.input_path)
	results.append(result)

	# Print detailed results for single file
	print(f'\nResults for {args.input_path}:')
	print(
	f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
	)
	print(
	f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
	)
	print(
	f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
	)
	print(
	f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
	)
	print(
	f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
	)
	print(f"Total cost: {result['costs']['total']:.2f} USD")
	print('## Statistics')
	print(
	f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
	)
	print(
	f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
	)
	print(
	f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
	)
	print(
	f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
	)

	print('## Detailed error breakdown:')
	for error, data in result['errors']['breakdown'].items():
	print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")

	except Exception as e:
	print(f'Error processing {args.input_path}: {str(e)}')