import os
from typing import List

import pandas as pd

from .utils import process_kernels, process_quantizations

DATASET_DIRECTORY = "dataset"

COLUMNS_MAPPING = {
    "config.name": "Experiment 🧪",
    "config.backend.model": "Model 🤗",
    # primary measurements
    "report.prefill.latency.p50": "Prefill (s)",
    "report.per_token.latency.p50": "Per Token (s)",
    "report.decode.throughput.value": "Decode (tokens/s)",
    "report.decode.efficiency.value": "Energy (tokens/kWh)",
    "report.decode.memory.max_allocated": "Memory (MB)",
    # deployment settings
    "config.backend.name": "Backend 🏭",
    "config.backend.torch_dtype": "Precision 📥",
    "quantization": "Quantization 🗜️",
    "attention": "Attention 👁️",
    "kernel": "Kernel ⚛️",
    # additional information
    "architecture": "Architecture 🏛️",
    "prefill+decode": "End-to-End (s)",
    "Average ⬆️": "Open LLM Score (%)",
    "#Params (B)": "Params (B)",
}

SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
SORTING_ASCENDING = [False, True, False]


def get_raw_llm_perf_df(
    machine: str, subsets: List[str], backends: List[str], hardware_type: str
):
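    """
    Download the per-backend, per-subset benchmark CSVs for the given machine from the
    optimum-benchmark/llm-perf-leaderboard dataset, concatenate them, and merge the
    result with the Open LLM Leaderboard dataframe on the model name.

    CSVs that cannot be read are skipped with a warning; if none are found, a
    ValueError is raised.
    """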
    dfs = []
    for subset in subsets:
        for backend in backends:
            try:
                dfs.append(
                    pd.read_csv(
                        f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
                    )
                )
            except Exception:
                print("Dataset not found for:")
                print(f"  • Backend: {backend}")
                print(f"  • Subset: {subset}")
                print(f"  • Machine: {machine}")
                print(f"  • Hardware Type: {hardware_type}")
                url = f"https://huggingface.co/datasets/optimum-benchmark/llm-perf-leaderboard/blob/main/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
                print(f"  • URL: {url}")
    if len(dfs) == 0:
        raise ValueError(
            f"No datasets found for machine {machine}; check your hardware.yml config file or your dataset on the Hugging Face Hub"
        )
    perf_df = pd.concat(dfs)
    llm_df = pd.read_csv(
        "hf://datasets/optimum-benchmark/llm-perf-leaderboard/llm-df.csv"
    )
    llm_perf_df = pd.merge(
        llm_df, perf_df, left_on="Model", right_on="config.backend.model"
    )
    return llm_perf_df


def processed_llm_perf_df(llm_perf_df):
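    """
    Clean and reshape the raw merged dataframe: drop rows without a decode latency,
    derive the end-to-end latency and human-readable attention/quantization/kernel
    labels, round the numeric columns, then select, rename, and sort the leaderboard
    columns.
    """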
    # sanity checks: all rows must come from the same benchmark scenario
    assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["config.scenario.generate_kwargs.max_new_tokens"].nunique() == 1
    assert llm_perf_df["config.scenario.generate_kwargs.min_new_tokens"].nunique() == 1
    # clean up a few columns
    llm_perf_df.dropna(subset=["report.decode.latency.p50"], inplace=True)
    llm_perf_df["config.name"] = llm_perf_df["config.name"].str.replace(
        "flash_attention_2", "fa2"
    )
    llm_perf_df["prefill+decode"] = (
        llm_perf_df["report.prefill.latency.p50"]
        + llm_perf_df["report.decode.latency.p50"]
    )
    # llm_perf_df["architecture"] = llm_perf_df["config.backend.model"].apply(
    #     process_architectures
    # )
    llm_perf_df["architecture"] = llm_perf_df["Architecture"]
    llm_perf_df["attention"] = (
        llm_perf_df["config.backend.attn_implementation"]
        .str.replace("flash_attention_2", "FAv2")
        .str.replace("eager", "Eager")
        .str.replace("sdpa", "SDPA")
    )
    # derive human-readable quantization and kernel labels from the backend config
    llm_perf_df["quantization"] = llm_perf_df.apply(process_quantizations, axis=1)
    llm_perf_df["kernel"] = llm_perf_df.apply(process_kernels, axis=1)
    # round numerical columns
    llm_perf_df = llm_perf_df.round(
        {
            "report.prefill.latency.p50": 3,
            "report.decode.latency.p50": 3,
            "report.decode.throughput.value": 3,
            "report.decode.efficiency.value": 3,
            "report.decode.memory.max_allocated": 3,
            "Average ⬆️": 3,
            "prefill+decode": 3,
            "#Params (B)": 3,
        }
    )
    # filter columns
    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
    # rename columns
    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
    # sort by metric
    llm_perf_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
        inplace=True,
    )
    return llm_perf_df


def get_llm_perf_df(
    machine: str, subsets: List[str], backends: List[str], hardware_type: str
):
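    """
    Return the processed leaderboard dataframe for a machine, reading it from the local
    cache in DATASET_DIRECTORY when available and downloading/processing it otherwise.
    """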
    if not os.path.exists(DATASET_DIRECTORY):
        os.makedirs(DATASET_DIRECTORY)
    if os.path.exists(f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv"):
        llm_perf_df = pd.read_csv(
            f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv"
        )
    else:
        print(f"Dataset for machine {machine} not found locally, downloading...")
        llm_perf_df = get_raw_llm_perf_df(machine, subsets, backends, hardware_type)
        llm_perf_df = processed_llm_perf_df(llm_perf_df)
        llm_perf_df.to_csv(
            f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv", index=False
        )
    return llm_perf_df
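

# A minimal usage sketch, not part of the original module: the machine name, subset,
# backend, and hardware type below are assumptions; substitute whatever your hardware
# config actually lists. Because of the relative import above, run this as a module
# (e.g. `python -m <package>.<this_module>`), not as a standalone script.
if __name__ == "__main__":
    df = get_llm_perf_df(
        machine="1xA10",
        subsets=["unquantized"],
        backends=["pytorch"],
        hardware_type="cuda",
    )
    print(df.head())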