#!/usr/bin/env python3
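"""Summarize OpenHands evaluation outputs (output.jsonl files).

Given a single output.jsonl file, print a detailed report: resolve rate with a
bootstrap confidence interval, empty patches, errors, costs, and turn counts.
Given a directory, aggregate every **/output.jsonl below it into a table sorted
by resolve rate and write .jsonl, .csv, and .txt summaries.

Example usage (the script name below is illustrative):
    python summarize_outputs.py path/to/output.jsonl
    python summarize_outputs.py path/to/eval_outputs/ --output summary_results.jsonl
"""
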
import argparse
import glob
import json
import os
import random
from collections import Counter

import numpy as np
import pandas as pd

from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events

ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
    'litellm.Timeout: APITimeoutError',
]


def get_bootstrap_accuracy_error_bars(
    values: list[float | int | bool], num_samples: int = 1000, p_value: float = 0.05
) -> tuple[float, float]:
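    """Bootstrap a (1 - p_value) confidence interval for the mean of `values`.

    Draws `num_samples` half-size subsamples without replacement, then returns
    the empirical p_value/2 and 1 - p_value/2 quantiles of the subsample means.
    """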
    sorted_vals = np.sort(
        [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
    )
    bottom_idx = int(num_samples * p_value / 2)
    top_idx = int(num_samples * (1.0 - p_value / 2))
    return (sorted_vals[bottom_idx], sorted_vals[top_idx])


def process_file(file_path):
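    """Summarize one output.jsonl file (one JSON object per evaluation instance).

    Returns a dict with resolve counts and a bootstrap CI, empty-patch /
    unfinished-run / error breakdowns, aggregate costs, and per-instance
    statistics (average turns and costs).
    """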
    with open(file_path, 'r') as file:
        lines = file.readlines()

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0
    num_resolved = 0
    resolved_arr = []
    num_empty_patch = 0
    num_unfinished_runs = 0
    error_counter = Counter()
    main_agent_cost = []
    editor_cost = []
    num_turns = []

    for line in lines:
        _d = json.loads(line)

        if 'metrics' not in _d or _d['metrics'] is None:
            # this is a failed run
            num_unfinished_runs += 1
            continue

        # Cost
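        # per-call costs are split between the main agent and the draft_editor model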
        costs = _d['metrics'].get('costs', [])
        _cur_main_agent_cost = 0
        _cur_editor_cost = 0
        for cost in costs:
            if isinstance(cost, float):
                # backward compatible
                _cur_main_agent_cost += cost
            else:
                if 'draft_editor' in cost['model']:
                    _cur_editor_cost += cost['cost']
                else:
                    _cur_main_agent_cost += cost['cost']

        main_agent_cost.append(_cur_main_agent_cost)
        editor_cost.append(_cur_editor_cost)

        # Turn status
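        # each event pair reconstructed from the history counts as one turn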
        history = _d.get('history', [])
        events = [event_from_dict(event) for event in history]
        pairs = get_pairs_from_events(events)
        num_turns.append(len(pairs))

        # Patch & resolve status
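        # instances with an empty git patch are tallied and skipped (never counted as resolved)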
        patch = _d.get('test_result', {}).get('git_patch', '')
        if patch == '':
            num_empty_patch += 1
            continue

        report = _d.get('report', {}) or {}
        resolved = report.get('resolved', False)
        if resolved:
            num_resolved += 1
            resolved_arr.append(1)
        else:
            resolved_arr.append(0)

        # Error
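        # classified from the explicit 'error' field when present, otherwise by
        # scanning the raw JSONL line for known error keywords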
        error = _d.get('error', None)

        if error is not None and isinstance(error, str):
            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
            contains_error = bool(error) and not agent_stuck_in_loop
            if agent_stuck_in_loop:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif contains_error:
                error_counter[error] += 1
            continue

        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    return {
        'file_path': file_path,
        'total_instances': num_lines,
        'resolved': {
            'count': num_resolved,
            'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
            'ci': tuple(
                x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)
            ),
        },
        'empty_patches': {
            'count': num_empty_patch,
            'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
        },
        'unfinished_runs': {
            'count': num_unfinished_runs,
            'percentage': (num_unfinished_runs / num_lines * 100)
            if num_lines > 0
            else 0,
        },
        'errors': {
            'total': num_error_lines,
            'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
            'stuck_in_loop': {
                'count': num_agent_stuck_in_loop,
                'percentage': (num_agent_stuck_in_loop / num_lines * 100)
                if num_lines > 0
                else 0,
            },
            'breakdown': {
                str(error): {
                    'count': count,
                    'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
                }
                for error, count in error_counter.items()
            },
        },
        'costs': {
            'main_agent': sum(main_agent_cost),
            'editor': sum(editor_cost),
            'total': sum(main_agent_cost) + sum(editor_cost),
        },
        'statistics': {
            'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
            'costs': {
                'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
                'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
                'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
                if num_lines > 0
                else 0,
            },
        },
    }


def aggregate_directory(input_path) -> pd.DataFrame:
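    """Run process_file over every **/output.jsonl under input_path and return
    a DataFrame with one row per file, sorted by resolve rate (descending)."""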
    # Process all output.jsonl files in subdirectories
    pattern = os.path.join(input_path, '**/output.jsonl')
    files = glob.glob(pattern, recursive=True)
    print(f'Processing {len(files)} files from directory {input_path}')

    # Process each file silently and collect results
    results = []
    for file_path in files:
        try:
            result = process_file(file_path)
            results.append(result)
        except Exception as e:
            print(f'Error processing {file_path}: {str(e)}')
            import traceback

            traceback.print_exc()
            continue

    # Convert results to pandas DataFrame and sort by resolve rate
    df = pd.DataFrame(results)

    # Extract directory name from file path
    df['directory'] = df['file_path'].apply(
        lambda x: os.path.basename(os.path.dirname(x))
    )

    df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
    df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci'])
    df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
    df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
    df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
    df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
    df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])

    df = df.sort_values('resolve_rate', ascending=False)

    return df


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_path', type=str, help='The file or directory to summarize'
    )
    parser.add_argument(
        '--output',
        type=str,
        help='Output JSONL file for results',
        default='summary_results.jsonl',
    )
    args = parser.parse_args()

    if os.path.isdir(args.input_path):
        df = aggregate_directory(args.input_path)
        # Create the summary string
        columns = [
            'directory',
            'resolve_rate',
            'empty_patch_rate',
            'unfinished_rate',
            'error_rate',
            'avg_turns',
            'avg_cost',
            'total_instances',
        ]
        summary_str = df[columns].to_string(
            float_format=lambda x: '{:.2f}'.format(x),
            formatters={
                'directory': lambda x: x[:90]
            },  # Truncate directory names to 90 chars
            index=False,
        )

        # Print to console
        print('\nResults summary (sorted by resolve rate):')
        print(summary_str)

        # Save to text file
        txt_output = args.output.rsplit('.', 1)[0] + '.txt'
        with open(txt_output, 'w') as f:
            f.write('Results summary (sorted by resolve rate):\n')
            f.write(summary_str)

        # Save
        df.to_json(args.output, lines=True, orient='records')
        df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
    else:
        # Process single file with detailed output
        results = []
        try:
            result = process_file(args.input_path)
            results.append(result)

            # Print detailed results for single file
            print(f'\nResults for {args.input_path}:')
            print(
                f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
            )
            print(
                f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
            )
            print(
                f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
            )
            print(
                f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
            )
            print(
                f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
            )
            print(f"Total cost: {result['costs']['total']:.2f} USD")
            print('## Statistics')
            print(
                f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
            )
            print(
                f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
            )
            print(
                f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
            )
            print(
                f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
            )

            print('## Detailed error breakdown:')
            for error, data in result['errors']['breakdown'].items():
                print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")

        except Exception as e:
            print(f'Error processing {args.input_path}: {str(e)}')