# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import os

import datasets
import evaluate
import wandb
from seametrics.user_friendly.utils import calculate_from_payload

_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
@article{milan2016mot16,
title={MOT16: A benchmark for multi-object tracking},
author={Milan, Anton and Leal-Taix{\'e}, Laura and Reid, Ian and Roth, Stefan and Schindler, Konrad},
journal={arXiv preprint arXiv:1603.00831},
year={2016}
}
"""

_DESCRIPTION = """\
The MOT Metrics module is designed to evaluate multi-object tracking (MOT)
algorithms by computing various metrics based on predicted and ground truth bounding
boxes. It serves as a crucial tool in assessing the performance of MOT systems,
aiding in the iterative improvement of tracking algorithms."""

_KWARGS_DESCRIPTION = """
Computes multi-object tracking scores for the given predictions and references.
Args:
    predictions: list of predicted bounding boxes to score. Each prediction
        is a sequence of float coordinates describing one box.
    references: list of ground-truth bounding boxes, one reference per prediction. Each
        reference is a sequence of float coordinates describing one box.
    max_iou (`float`, *optional*):
        If specified, this is the minimum Intersection over Union (IoU) threshold to
        consider a detection as a true positive. Default is 0.5.
"""


class UserFriendlyMetrics(evaluate.Metric):
    """Computes user-friendly multi-object tracking (MOT) metrics from predicted and ground-truth bounding boxes."""

    def _info(self):
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(
                        datasets.Sequence(datasets.Value("float"))
                    ),
                    "references": datasets.Sequence(
                        datasets.Sequence(datasets.Value("float"))
                    ),
                }
            ),
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # TODO: Download external resources if needed
        pass

    def compute_from_payload(self, payload, **kwargs):
        """Convenience wrapper that forwards a seametrics payload to `_compute`."""
        return self._compute(payload, **kwargs)

    def _compute(
        self,
        payload,
        max_iou: float = 0.5,
        filter=None,
        recognition_thresholds=None,
        debug: bool = False,
    ):
        """Returns the scores"""
        # Avoid mutable default arguments; fall back to the original defaults.
        if filter is None:
            filter = {}
        if recognition_thresholds is None:
            recognition_thresholds = [0.3, 0.5, 0.8]
        return calculate_from_payload(
            payload, max_iou, filter, recognition_thresholds, debug
        )
        # return calculate(predictions, references, max_iou)
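
    # Note (assumption, inferred from the W&B helpers below rather than stated in
    # this file): `wandb()` expects `results` to map a run key to a per-run dict
    # with "overall" and "per_sequence" sections, roughly
    #   results[<run_key>]["overall"]["all"][<metric>] -> value
    #   results[<run_key>]["per_sequence"][<sequence>]["all"][<metric>] -> value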
    def wandb(
        self,
        results,
        wandb_section: str = None,
        wandb_runs=None,
        wandb_project="user_friendly_metrics",
        log_plots: bool = True,
        debug: bool = False,
    ):
        """
        Logs metrics to Weights and Biases (wandb) for tracking and visualization, including categorized bar charts for overall metrics.

        Args:
            results (dict): Mapping of result keys to result dictionaries, each with 'overall' and 'per_sequence' keys.
            wandb_section (str, optional): W&B section for metric grouping. Defaults to None.
            wandb_runs (list, optional): One run name per entry in `results`. Defaults to None, in which case run names are derived from the result keys and the current timestamp.
            wandb_project (str, optional): The name of the wandb project. Defaults to 'user_friendly_metrics'.
            log_plots (bool, optional): Generates categorized bar charts for overall metrics. Defaults to True.
            debug (bool, optional): Logs detailed summaries and histories to the terminal console. Defaults to False.
        """
        current_datetime = datetime.datetime.now()
        formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
        wandb.login(key=os.getenv("WANDB_API_KEY"))

        if wandb_runs is not None:
            assert len(wandb_runs) == len(results), "runs and results must have the same length"
        else:
            wandb_runs = [f"{i}_{formatted_datetime}" for i in list(results.keys())]

        for wandb_run_name, result in zip(wandb_runs, results.values()):
            self.wandb_run(
                result=result,
                wandb_run_name=wandb_run_name,
                wandb_project=wandb_project,
                wandb_section=wandb_section,
                log_plots=log_plots,
                debug=debug,
            )

    def wandb_run(self, result, wandb_run_name, wandb_project, debug, wandb_section=None, log_plots=True):
        run = wandb.init(
            project=wandb_project,
            name=wandb_run_name,
            reinit=True,
            settings=wandb.Settings(silent=not debug),
        )

        categories = {
            "user_friendly_metrics": {
                "mostly_tracked_score_0.3",
                "mostly_tracked_score_0.5",
                "mostly_tracked_score_0.8",
            },
            "evaluation_metrics_dev": {
                "recall",
            },
            "user_friendly_metrics_dev": {
                "mostly_tracked_count_0.3",
                "mostly_tracked_count_0.5",
                "mostly_tracked_count_0.8",
                "unique_obj_count",
            },
            "predictions_summary": {
                "tp",
                "fn",
            },
        }
        chart_data = {key: [] for key in categories.keys()}

        # Log overall metrics
        if "overall" in result:
            for metric, value in result["overall"]["all"].items():
                log_key = (
                    f"{wandb_section}/overall/{metric}"
                    if wandb_section
                    else f"overall/{metric}"
                )
                run.log({log_key: value})
                if debug:
                    print(f"    {log_key} = {value}")
                for category, metrics in categories.items():
                    if metric in metrics:
                        chart_data[category].append([metric, value])
            print("----------------------------------------------------")

            if log_plots:
                for category, data in chart_data.items():
                    if data:
                        table_data = [[label, value] for label, value in data]
                        table = wandb.Table(data=table_data, columns=["metrics", "value"])
                        run.log(
                            {
                                f"{category}_bar_chart": wandb.plot.bar(
                                    table,
                                    "metrics",
                                    "value",
                                    title=f"{category.replace('_', ' ').title()}",
                                )
                            }
                        )
if "per_sequence" in result: | |
sorted_sequences = sorted( | |
result["per_sequence"].items(), | |
key=lambda x: next(iter(x[1].values()), {}).get("all", {}).get("f1", 0), | |
reverse=True, # Set to True for descending order | |
) | |
for sequence_name, sequence_data in sorted_sequences: | |
for metric, value in sequence_data["all"].items(): | |
log_key = ( | |
f"{wandb_section}/per_sequence/{sequence_name}/{metric}" | |
if wandb_section | |
else f"per_sequence/{sequence_name}/{metric}" | |
) | |
run.log({log_key: value}) | |
if debug: | |
print(f" {log_key} = {value}") | |
print("----------------------------------------------------") | |
if debug: | |
print("\nDebug Mode: Logging Summary and History") | |
print(f"Results Summary:\n{result}") | |
print(f"WandB Settings:\n{run.settings}") | |
print("All metrics have been logged.") | |
run.finish() | |
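
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module). The exact structure
# of `payload` is defined by seametrics' calculate_from_payload and is assumed
# here, so the calls are left as comments rather than executable code:
#
#     metric = UserFriendlyMetrics()
#     results = metric.compute_from_payload(payload, max_iou=0.5, debug=True)
#     metric.wandb(results, wandb_project="user_friendly_metrics")
#
# Logging to Weights & Biases requires the WANDB_API_KEY environment variable.
# ---------------------------------------------------------------------------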