|
import argparse
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
|
|
def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:
    """Split instance ids into passed and failed lists based on the test exit code.

    An instance counts as passed only when its ``test_result`` is a mapping
    that contains an ``exit_code`` equal to 0. Every other case (missing
    ``test_result`` column, ``None``/NaN cell, mapping without ``exit_code``,
    non-zero exit code) counts as failed.

    Args:
        df: Frame with an ``instance_id`` column and, optionally, a
            ``test_result`` column holding dict-like values.

    Returns:
        A ``(passed, failed)`` tuple of instance-id lists, each in row order.

    Raises:
        KeyError: If ``df`` has no ``instance_id`` column.
    """
    from collections.abc import Mapping

    passed: list[str] = []
    failed: list[str] = []

    instance_ids = df['instance_id']
    if 'test_result' in df.columns:
        test_results = df['test_result']
    else:
        # No results recorded at all: every instance is reported as failed.
        test_results = [None] * len(df)

    for instance_id, test_result in zip(instance_ids, test_results):
        # Records lacking a test result surface as None/NaN cells; guard on the
        # type so they are classified as failures instead of raising TypeError.
        resolved = (
            isinstance(test_result, Mapping)
            and 'exit_code' in test_result
            and test_result['exit_code'] == 0
        )
        if resolved:
            passed.append(instance_id)
        else:
            failed.append(instance_id)

    return passed, failed
|
|
|
|
|
|
def visualize_results(df: pd.DataFrame):
    """Print summary statistics for a results frame and return the resolve rate.

    Prints the pass count, descriptive statistics for the number of actions
    and for the accumulated cost, and binned counts for both.

    Args:
        df: Frame with a ``metrics`` column (dict-like, containing
            ``accumulated_cost``), a ``test_result`` column (dict-like,
            containing ``exit_code``) and a ``history`` column (sized values).

    Returns:
        The percentage of rows whose exit code is 0, rounded to two decimals.
    """
    df1 = pd.DataFrame()
    df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
    df1['result'] = (
        df['test_result'].apply(pd.Series)['exit_code'].map({0: 'Pass', 1: 'Fail'})
    )
    # Reuse df's index: column assignment aligns on index labels, so a Series
    # with a fresh RangeIndex would become NaN whenever df is not indexed
    # 0..n-1 (for example after filtering).
    # NOTE(review): the "- 1" presumably excludes a non-action first history
    # entry; confirm against the producer of `history`.
    df1['actions'] = pd.Series(
        [len(a) - 1 for a in df['history']], index=df.index
    )

    passed = np.sum(df1['result'] == 'Pass')
    total = df.shape[0]
    resolve_rate = round((passed / total) * 100, 2)

    print('Number of passed tests:', f'{passed}/{total} {resolve_rate:.2f}%')
    print('\nDescriptive statistics for number of actions:')
    print(df1['actions'].describe())
    print('\nDescriptive statistics for costs:')
    print(df1['cost'].describe())

    # Fixed-width action buckets (0, 2], (2, 4], ..., (28, 30]; values outside
    # that range are not counted in any bucket.
    action_bins = pd.cut(df1['actions'], bins=range(0, 32, 2))
    print('\nAction bin counts:')
    print(action_bins.value_counts().sort_index())

    # Ten equal-width buckets spanning the observed cost range.
    cost_bins = pd.cut(df1['cost'], bins=10)
    print('\nCost bin counts:')
    print(cost_bins.value_counts().sort_index())

    return resolve_rate
|
|
|
|
|
|
if __name__ == '__main__':
    # Command line: a single positional argument naming the JSONL results file.
    parser = argparse.ArgumentParser(description='Summarize AiderBench results')
    parser.add_argument('input_filepath', type=str, help='Path to the JSONL file')
    args = parser.parse_args()

    # One JSON record per line.
    df = pd.read_json(args.input_filepath, lines=True)

    # Per-instance pass/fail lists first, then the printed statistics report.
    passed_tests, failed_tests = extract_test_results(df)
    resolve_rate = visualize_results(df)

    summary = f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
    print(summary)

    # Emit each heading followed by its list of instance ids.
    for heading, instance_ids in (
        ('PASSED TESTS:', passed_tests),
        ('FAILED TESTS:', failed_tests),
    ):
        print(heading)
        print(instance_ids)
|
|
|