# user-friendly-metrics / user-friendly-metrics.py
# Gil-Simas's picture
# filters -> filter
# c62f523
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import os
import datasets
import evaluate
from seametrics.user_friendly.utils import calculate_from_payload
import wandb
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}\
@article{milan2016mot16,
title={MOT16: A benchmark for multi-object tracking},
author={Milan, Anton and Leal-Taix{\'e}, Laura and Reid, Ian and Roth, Stefan and Schindler, Konrad},
journal={arXiv preprint arXiv:1603.00831},
year={2016}
}
"""
# Human-readable summary of the module. It is passed to the class docstring
# decorator and used as the `description` of the MetricInfo returned by `_info`.
_DESCRIPTION = """\
The MOT Metrics module is designed to evaluate multi-object tracking (MOT)
algorithms by computing various metrics based on predicted and ground truth bounding
boxes. It serves as a crucial tool in assessing the performance of MOT systems,
aiding in the iterative improvement of tracking algorithms."""
# Argument documentation. It is passed to the class docstring decorator and
# used as `inputs_description` of the MetricInfo returned by `_info`.
# NOTE(review): the text below describes predictions/references as
# space-separated token strings, while `_info` declares them as nested float
# sequences and `_compute` takes a `payload` plus `filter`,
# `recognition_thresholds` and `debug`, none of which are documented here.
# It also calls `max_iou` a *minimum* IoU threshold. Confirm the intended
# contract and update the wording.
_KWARGS_DESCRIPTION = """
Calculates how good are predictions given some references, using certain scores
Args:
predictions: list of predictions to score. Each predictions
should be a string with tokens separated by spaces.
references: list of reference for each prediction. Each
reference should be a string with tokens separated by spaces.
max_iou (`float`, *optional*):
If specified, this is the minimum Intersection over Union (IoU) threshold to consider a detection as a true positive.
Default is 0.5.
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class UserFriendlyMetrics(evaluate.Metric):
    """Metric module that scores a payload and can upload the results to W&B.

    Scoring is delegated to ``calculate_from_payload``. The ``wandb`` and
    ``wandb_run`` methods log an already computed results dictionary to
    Weights & Biases.
    """

    # Overall metrics grouped by the bar chart they are plotted in.
    _CHART_CATEGORIES = {
        "user_friendly_metrics": {
            "mostly_tracked_score_0.3",
            "mostly_tracked_score_0.5",
            "mostly_tracked_score_0.8",
        },
        "evaluation_metrics_dev": {
            "recall",
        },
        "user_friendly_metrics_dev": {
            "mostly_tracked_count_0.3",
            "mostly_tracked_count_0.5",
            "mostly_tracked_count_0.8",
            "unique_obj_count",
        },
        "predictions_summary": {
            "tp",
            "fn",
        },
    }

    # Visual separator printed after each logged section.
    _SEPARATOR = "----------------------------------------------------"

    def _info(self):
        """Return the ``evaluate.MetricInfo`` describing this module."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference.
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(
                        datasets.Sequence(datasets.Value("float"))
                    ),
                    "references": datasets.Sequence(
                        datasets.Sequence(datasets.Value("float"))
                    ),
                }
            ),
            # TODO: these look like template placeholder URLs; replace them
            # with the real codebase and reference links.
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional hook for downloading external resources; nothing is needed."""

    def compute_from_payload(self, payload, **kwargs):
        """Score ``payload`` directly by calling ``_compute``.

        Args:
            payload: Object forwarded unchanged to ``_compute``.
            **kwargs: Keyword arguments forwarded to ``_compute``.

        Returns:
            Whatever ``_compute`` returns.
        """
        return self._compute(payload, **kwargs)

    def _compute(
        self,
        payload,
        max_iou: float = 0.5,
        filter=None,
        recognition_thresholds=None,
        debug: bool = False,
    ):
        """Return the scores computed by ``calculate_from_payload``.

        Args:
            payload: Input forwarded to ``calculate_from_payload``.
            max_iou: IoU threshold forwarded to ``calculate_from_payload``.
            filter: Filter specification forwarded to
                ``calculate_from_payload``. ``None`` means an empty dict.
            recognition_thresholds: Thresholds forwarded to
                ``calculate_from_payload``. ``None`` means ``[0.3, 0.5, 0.8]``.
            debug: Debug flag forwarded to ``calculate_from_payload``.

        Returns:
            The value returned by ``calculate_from_payload``.
        """
        # Fresh objects per call: mutable defaults in the signature would be
        # shared between calls and could be mutated by the callee.
        if filter is None:
            filter = {}
        if recognition_thresholds is None:
            recognition_thresholds = [0.3, 0.5, 0.8]

        return calculate_from_payload(
            payload, max_iou, filter, recognition_thresholds, debug
        )

    def wandb(
        self,
        results,
        wandb_section: str = None,
        wandb_runs=None,
        wandb_project="user_friendly_metrics",
        log_plots: bool = True,
        debug: bool = False,
    ):
        """Log each entry of ``results`` to Weights & Biases as its own run.

        Args:
            results (dict): Maps a name to a result dictionary. Each result
                dictionary may contain the keys 'overall' and 'per_sequence'.
            wandb_section (str, optional): Prefix prepended to every logged
                metric key. Defaults to None (no prefix).
            wandb_runs (list, optional): Run names, one per entry of
                ``results`` and in the same order. Defaults to None, in which
                case each run is named ``"<results key>_<timestamp>"``.
            wandb_project (str, optional): The name of the wandb project.
                Defaults to 'user_friendly_metrics'.
            log_plots (bool, optional): Whether to log categorized bar charts
                for the overall metrics. Defaults to True.
            debug (bool, optional): Whether to print logged values and a
                summary to the console. Defaults to False.

        Raises:
            ValueError: If ``wandb_runs`` is given and its length differs from
                the number of entries in ``results``.
        """
        if wandb_runs is None:
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            wandb_runs = [f"{name}_{timestamp}" for name in results]
        elif len(wandb_runs) != len(results):
            # Raised explicitly (not asserted) so the check survives `python -O`;
            # otherwise zip() below would silently drop the unmatched entries.
            raise ValueError("runs and results must have the same length")

        wandb.login(key=os.getenv("WANDB_API_KEY"))

        for wandb_run_name, result in zip(wandb_runs, results.values()):
            self.wandb_run(
                result=result,
                wandb_run_name=wandb_run_name,
                wandb_project=wandb_project,
                debug=debug,
                wandb_section=wandb_section,
                log_plots=log_plots,
            )

    @staticmethod
    def _sequence_f1(sequence_data):
        """Return ``sequence_data["all"]["f1"]``, or 0 if missing or not numeric."""
        f1 = sequence_data.get("all", {}).get("f1", 0)
        return f1 if isinstance(f1, (int, float)) else 0

    def wandb_run(
        self,
        result,
        wandb_run_name,
        wandb_project,
        debug,
        wandb_section=None,
        log_plots=True,
    ):
        """Log one result dictionary to a single Weights & Biases run.

        Args:
            result (dict): May contain 'overall' (with an 'all' mapping of
                metric name to value) and 'per_sequence' (mapping sequence
                name to a dict with an 'all' mapping of metric name to value).
            wandb_run_name (str): Name of the run to create.
            wandb_project (str): Name of the wandb project.
            debug (bool): Whether to print logged values and a summary; also
                disables wandb's silent mode.
            wandb_section (str, optional): Prefix prepended to every logged
                metric key. Defaults to None (no prefix).
            log_plots (bool, optional): Whether to log one bar chart per
                non-empty metric category. Defaults to True.
        """
        run = wandb.init(
            project=wandb_project,
            name=wandb_run_name,
            reinit=True,
            settings=wandb.Settings(silent=not debug),
        )

        # The run is finished even if logging fails, so a failed upload does
        # not leave a dangling active run behind.
        try:
            prefix = f"{wandb_section}/" if wandb_section else ""
            chart_data = {category: [] for category in self._CHART_CATEGORIES}

            # Log overall metrics and collect the ones that belong to a chart.
            if "overall" in result:
                for metric, value in result["overall"]["all"].items():
                    log_key = f"{prefix}overall/{metric}"
                    run.log({log_key: value})
                    if debug:
                        print(f" {log_key} = {value}")
                    for category, metrics in self._CHART_CATEGORIES.items():
                        if metric in metrics:
                            chart_data[category].append([metric, value])
                print(self._SEPARATOR)

            if log_plots:
                for category, rows in chart_data.items():
                    if not rows:
                        continue
                    table = wandb.Table(data=rows, columns=["metrics", "value"])
                    run.log(
                        {
                            f"{category}_bar_chart": wandb.plot.bar(
                                table,
                                "metrics",
                                "value",
                                title=category.replace("_", " ").title(),
                            )
                        }
                    )

            if "per_sequence" in result:
                # Best sequences first. The key reads the same
                # sequence_data["all"] mapping that is logged below; sequences
                # without an "f1" entry count as 0 and keep their order.
                sorted_sequences = sorted(
                    result["per_sequence"].items(),
                    key=lambda item: self._sequence_f1(item[1]),
                    reverse=True,
                )
                for sequence_name, sequence_data in sorted_sequences:
                    for metric, value in sequence_data["all"].items():
                        log_key = f"{prefix}per_sequence/{sequence_name}/{metric}"
                        run.log({log_key: value})
                        if debug:
                            print(f" {log_key} = {value}")
                print(self._SEPARATOR)

            if debug:
                print("\nDebug Mode: Logging Summary and History")
                print(f"Results Summary:\n{result}")
                print(f"WandB Settings:\n{run.settings}")
                print("All metrics have been logged.")
        finally:
            run.finish()