#!/bin/bash
# Evaluate an inference output file with the SWE-bench harness.
#
# Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]

# First positional argument: the output file to evaluate (required).
PROCESS_FILEPATH=$1
if [[ -z "$PROCESS_FILEPATH" ]]; then
  echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
  exit 1
fi

# The expansion is quoted so a path containing spaces or glob characters is
# tested as one word instead of being split into several arguments.
if [[ ! -f "$PROCESS_FILEPATH" ]]; then
  echo "Error: $PROCESS_FILEPATH is not a file"
  exit 1
fi

# Optional positional arguments.
#   $2 - instance id; when empty the whole input file is evaluated
#   $3 - dataset name (defaults to SWE-bench Lite)
#   $4 - dataset split (defaults to "test")
INSTANCE_ID=$2
DATASET_NAME=${3:-"princeton-nlp/SWE-bench_Lite"}
SPLIT=${4:-"test"}

echo "INSTANCE_ID: $INSTANCE_ID"
echo "DATASET_NAME: $DATASET_NAME"
echo "SPLIT: $SPLIT"

# Normalise to an absolute path, then split it into directory and file name.
# Every expansion is quoted so paths with spaces survive intact.
PROCESS_FILEPATH=$(realpath "$PROCESS_FILEPATH")
FILE_DIR=$(dirname "$PROCESS_FILEPATH")
FILE_NAME=$(basename "$PROCESS_FILEPATH")

echo "Evaluating $FILE_NAME @ $FILE_DIR"

echo "=============================================================="
echo "Detecting whether PROCESS_FILEPATH is in OH format or in SWE-bench format"
echo "=============================================================="

#######################################
# Check whether a JSONL file is already in SWE-bench prediction format.
# Only the first line is inspected: it must be a JSON object that has the
# keys "model_name_or_path", "instance_id" and "model_patch".
# Globals:
#   PROCESS_FILEPATH (read) - used when no argument is given
# Arguments:
#   $1 - path of the file to inspect (optional)
# Returns:
#   0 if the first line has all three keys, 1 otherwise (including when the
#   file is unreadable or empty, or jq fails).
#######################################
is_swebench_format() {
  local file=${1:-$PROCESS_FILEPATH}
  local first_line=""

  # read returns non-zero when the line has no trailing newline but still
  # fills the variable, so only give up when nothing at all was read.
  read -r first_line < "$file" || [[ -n "$first_line" ]] || return 1

  # jq -e maps the boolean result onto the exit status; printf is used because
  # echo would misinterpret a line that looks like one of its options.
  if printf '%s\n' "$first_line" \
    | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null; then
    return 0
  fi
  return 1
}

# Keep the detection result in a variable (0 means SWE-bench format).
is_swebench_format "$PROCESS_FILEPATH"
IS_SWEBENCH_FORMAT=$?

if [[ "$IS_SWEBENCH_FORMAT" -eq 0 ]]; then
  echo "The file IS in SWE-bench format."
  SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
else
  echo "The file IS NOT in SWE-bench format."

  echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
  # Stop on a failed conversion: otherwise a stale .swebench.jsonl left by an
  # earlier run would pass the existence check below and be evaluated.
  if ! poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py "$PROCESS_FILEPATH"; then
    echo "Error: failed to convert $PROCESS_FILEPATH to SWE-bench format."
    exit 1
  fi

  # Path where the converted file is expected.
  # NOTE(review): presumably this mirrors the naming used by the converter
  # script - confirm before changing the substitution.
  SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
  echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"

  if [[ ! -f "$SWEBENCH_FORMAT_JSONL" ]]; then
    echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process."
    exit 1
  fi
  SWEBENCH_FORMAT_JSONL=$(realpath "$SWEBENCH_FORMAT_JSONL")
fi

echo "=============================================================="
echo "Running SWE-bench evaluation"
echo "=============================================================="

# Timestamp identifier for this run; passed to the harness as --run_id.
RUN_ID=$(date +"%Y%m%d_%H%M%S")
# Passed to the harness as --max_workers.
N_PROCESS=16

# Harness options shared by both invocations below. An array keeps each value
# a single word even when it contains spaces.
eval_args=(
  --dataset_name "$DATASET_NAME"
  --split "$SPLIT"
  --predictions_path "$SWEBENCH_FORMAT_JSONL"
  --timeout 1800
  --cache_level instance
  --max_workers "$N_PROCESS"
  --run_id "$RUN_ID"
)

if [[ -z "$INSTANCE_ID" ]]; then
  echo "Running SWE-bench evaluation on the whole input file..."
  poetry run python -m swebench.harness.run_evaluation "${eval_args[@]}"

  # Model name as recorded in the first prediction of the evaluated file.
  # NOTE(review): it is used verbatim as a directory / file name below; a name
  # containing "/" would presumably not match what the harness wrote - confirm.
  MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' "$SWEBENCH_FORMAT_JSONL" | head -n 1)
  echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH"

  RESULT_OUTPUT_DIR=$(dirname "$SWEBENCH_FORMAT_JSONL")
  echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"

  # Replace any previous eval_outputs directory with this run's logs.
  mkdir -p "$RESULT_OUTPUT_DIR"
  if [[ -d "$RESULT_OUTPUT_DIR/eval_outputs" ]]; then
    # ":?" aborts instead of expanding to "/eval_outputs" if the variable is empty.
    rm -rf -- "${RESULT_OUTPUT_DIR:?}/eval_outputs"
  fi
  mv -- "logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH" "$RESULT_OUTPUT_DIR/eval_outputs"
  echo "RUN_ID: $RUN_ID" > "$RESULT_OUTPUT_DIR/run_id.txt"

  # If a report file for this run exists in the current directory, move it
  # next to the results, keeping one backup of a previous report.
  REPORT_PATH=$MODEL_NAME_OR_PATH.$RUN_ID.json
  if [[ -f "$REPORT_PATH" ]]; then
    if [[ -f "$RESULT_OUTPUT_DIR/report.json" ]]; then
      echo "Report file $RESULT_OUTPUT_DIR/report.json already exists. Overwriting..."
      # -f replaces an older backup without a separate rm.
      mv -f -- "$RESULT_OUTPUT_DIR/report.json" "$RESULT_OUTPUT_DIR/report.json.bak"
    fi
    mv -- "$REPORT_PATH" "$RESULT_OUTPUT_DIR/report.json"
  fi

  poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py "$PROCESS_FILEPATH"
else
  echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
  # INSTANCE_ID is intentionally left unquoted so that a whitespace-separated
  # list passed as one argument still expands to separate values.
  # shellcheck disable=SC2086
  poetry run python -m swebench.harness.run_evaluation "${eval_args[@]}" \
    --instance_ids $INSTANCE_ID
fi