Spaces:
Running
Running
Commit
·
0232cf1
1
Parent(s):
a8a6326
update
Browse files- app.py +5 -3
- src/bettertransformer.py +21 -21
- src/control_panel.py +31 -23
- src/flashattentionv2.py +22 -22
- src/latency_score_memory.py +12 -12
- src/leaderboard.py +34 -15
- src/llm_perf.py +15 -8
- src/quantization_kernels.py +24 -24
app.py
CHANGED
@@ -35,7 +35,6 @@ with demo:
|
|
35 |
(
|
36 |
filter_button,
|
37 |
machine_textbox,
|
38 |
-
search_bar,
|
39 |
score_slider,
|
40 |
memory_slider,
|
41 |
backend_checkboxes,
|
@@ -48,7 +47,7 @@ with demo:
|
|
48 |
llm_perf_df = get_llm_perf_df(machine=machine)
|
49 |
####################### LEADERBOARD TAB #######################
|
50 |
with gr.TabItem("Leaderboard π
", id=0):
|
51 |
-
|
52 |
lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
|
53 |
####################### BETTERTRANSFORMER SPEEDUP TAB #######################
|
54 |
with gr.TabItem("BetterTransformer π", id=2):
|
@@ -63,14 +62,15 @@ with demo:
|
|
63 |
filter_button,
|
64 |
# inputs
|
65 |
machine_textbox,
|
66 |
-
search_bar,
|
67 |
score_slider,
|
68 |
memory_slider,
|
69 |
backend_checkboxes,
|
70 |
datatype_checkboxes,
|
71 |
optimization_checkboxes,
|
72 |
quantization_checkboxes,
|
|
|
73 |
columns_checkboxes,
|
|
|
74 |
# outputs
|
75 |
leaderboard_table,
|
76 |
lat_score_mem_plot,
|
@@ -85,7 +85,9 @@ with demo:
|
|
85 |
create_select_callback(
|
86 |
# inputs
|
87 |
machine_textbox,
|
|
|
88 |
columns_checkboxes,
|
|
|
89 |
# outputs
|
90 |
leaderboard_table,
|
91 |
)
|
|
|
35 |
(
|
36 |
filter_button,
|
37 |
machine_textbox,
|
|
|
38 |
score_slider,
|
39 |
memory_slider,
|
40 |
backend_checkboxes,
|
|
|
47 |
llm_perf_df = get_llm_perf_df(machine=machine)
|
48 |
####################### LEADERBOARD TAB #######################
|
49 |
with gr.TabItem("Leaderboard π
", id=0):
|
50 |
+
search_bar, columns_checkboxes, leaderboard_table = create_leaderboard_table(llm_perf_df)
|
51 |
lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
|
52 |
####################### BETTERTRANSFORMER SPEEDUP TAB #######################
|
53 |
with gr.TabItem("BetterTransformer π", id=2):
|
|
|
62 |
filter_button,
|
63 |
# inputs
|
64 |
machine_textbox,
|
|
|
65 |
score_slider,
|
66 |
memory_slider,
|
67 |
backend_checkboxes,
|
68 |
datatype_checkboxes,
|
69 |
optimization_checkboxes,
|
70 |
quantization_checkboxes,
|
71 |
+
# interactive
|
72 |
columns_checkboxes,
|
73 |
+
search_bar,
|
74 |
# outputs
|
75 |
leaderboard_table,
|
76 |
lat_score_mem_plot,
|
|
|
85 |
create_select_callback(
|
86 |
# inputs
|
87 |
machine_textbox,
|
88 |
+
# interactive
|
89 |
columns_checkboxes,
|
90 |
+
search_bar,
|
91 |
# outputs
|
92 |
leaderboard_table,
|
93 |
)
|
src/bettertransformer.py
CHANGED
@@ -6,10 +6,10 @@ import plotly.express as px
|
|
6 |
BETTERTRANSFORMER_DATA = [
|
7 |
# open llm
|
8 |
"Model π€",
|
9 |
-
"Arch ποΈ",
|
10 |
"DType π₯",
|
11 |
"Backend π",
|
12 |
"Params (B)",
|
|
|
13 |
"Open LLM Score (%)",
|
14 |
# deployment settings
|
15 |
"DType π₯",
|
@@ -18,15 +18,15 @@ BETTERTRANSFORMER_DATA = [
|
|
18 |
"Quantization ποΈ",
|
19 |
"Optimization π οΈ BetterTransformer",
|
20 |
# primary measurements
|
21 |
-
"Prefill
|
22 |
-
"Prefill
|
23 |
-
"Decode
|
24 |
-
"Decode
|
25 |
-
"
|
26 |
-
"
|
27 |
# speedups
|
28 |
-
"Prefill
|
29 |
-
"Decode
|
30 |
]
|
31 |
|
32 |
|
@@ -43,15 +43,15 @@ def get_bt_df(llm_perf_df):
|
|
43 |
suffixes=["", " BetterTransformer"],
|
44 |
)
|
45 |
# compute speedups
|
46 |
-
bt_df["Prefill
|
47 |
-
(bt_df["Prefill
|
48 |
).round(2) - 100
|
49 |
-
bt_df["Decode
|
50 |
-
(bt_df["Decode
|
51 |
).round(2) - 100
|
52 |
# filter speedups > 1000%
|
53 |
-
bt_df = bt_df[bt_df["Prefill
|
54 |
-
bt_df = bt_df[bt_df["Decode
|
55 |
|
56 |
return bt_df
|
57 |
|
@@ -61,8 +61,8 @@ def get_bt_prefill_fig(llm_perf_df):
|
|
61 |
# plot
|
62 |
prefill_fig = px.box(
|
63 |
bt_df,
|
64 |
-
x="
|
65 |
-
y="Prefill
|
66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
67 |
custom_data=BETTERTRANSFORMER_DATA,
|
68 |
color="Quantization ποΈ",
|
@@ -77,7 +77,7 @@ def get_bt_prefill_fig(llm_perf_df):
|
|
77 |
# add layout
|
78 |
prefill_fig.update_layout(
|
79 |
title={
|
80 |
-
"text": "Prefill
|
81 |
"y": 0.95,
|
82 |
"x": 0.5,
|
83 |
"xanchor": "center",
|
@@ -98,8 +98,8 @@ def get_bt_decode_fig(llm_perf_df):
|
|
98 |
# plot
|
99 |
decode_fig = px.box(
|
100 |
bt_df,
|
101 |
-
x="
|
102 |
-
y="Decode
|
103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
104 |
custom_data=BETTERTRANSFORMER_DATA,
|
105 |
color="Quantization ποΈ",
|
@@ -114,7 +114,7 @@ def get_bt_decode_fig(llm_perf_df):
|
|
114 |
# add layout
|
115 |
decode_fig.update_layout(
|
116 |
title={
|
117 |
-
"text": "Decode
|
118 |
"y": 0.95,
|
119 |
"x": 0.5,
|
120 |
"xanchor": "center",
|
|
|
6 |
BETTERTRANSFORMER_DATA = [
|
7 |
# open llm
|
8 |
"Model π€",
|
|
|
9 |
"DType π₯",
|
10 |
"Backend π",
|
11 |
"Params (B)",
|
12 |
+
"Architecture ποΈ",
|
13 |
"Open LLM Score (%)",
|
14 |
# deployment settings
|
15 |
"DType π₯",
|
|
|
18 |
"Quantization ποΈ",
|
19 |
"Optimization π οΈ BetterTransformer",
|
20 |
# primary measurements
|
21 |
+
"Prefill (s)",
|
22 |
+
"Prefill (s) BetterTransformer",
|
23 |
+
"Decode (tokens/s)",
|
24 |
+
"Decode (tokens/s) BetterTransformer",
|
25 |
+
"End-to-End (tokens/s)",
|
26 |
+
"End-to-End (tokens/s) BetterTransformer",
|
27 |
# speedups
|
28 |
+
"Prefill Speedup (%)",
|
29 |
+
"Decode Speedup (%)",
|
30 |
]
|
31 |
|
32 |
|
|
|
43 |
suffixes=["", " BetterTransformer"],
|
44 |
)
|
45 |
# compute speedups
|
46 |
+
bt_df["Prefill Speedup (%)"] = (
|
47 |
+
(bt_df["Prefill (s)"] / bt_df["Prefill (s) BetterTransformer"]) * 100
|
48 |
).round(2) - 100
|
49 |
+
bt_df["Decode Speedup (%)"] = (
|
50 |
+
(bt_df["Decode (tokens/s) BetterTransformer"] / bt_df["Decode (tokens/s)"]) * 100
|
51 |
).round(2) - 100
|
52 |
# filter speedups > 1000%
|
53 |
+
bt_df = bt_df[bt_df["Prefill Speedup (%)"] < 1000]
|
54 |
+
bt_df = bt_df[bt_df["Decode Speedup (%)"] < 1000]
|
55 |
|
56 |
return bt_df
|
57 |
|
|
|
61 |
# plot
|
62 |
prefill_fig = px.box(
|
63 |
bt_df,
|
64 |
+
x="Architecture ποΈ",
|
65 |
+
y="Prefill Speedup (%)",
|
66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
67 |
custom_data=BETTERTRANSFORMER_DATA,
|
68 |
color="Quantization ποΈ",
|
|
|
77 |
# add layout
|
78 |
prefill_fig.update_layout(
|
79 |
title={
|
80 |
+
"text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
|
81 |
"y": 0.95,
|
82 |
"x": 0.5,
|
83 |
"xanchor": "center",
|
|
|
98 |
# plot
|
99 |
decode_fig = px.box(
|
100 |
bt_df,
|
101 |
+
x="Architecture ποΈ",
|
102 |
+
y="Decode Speedup (%)",
|
103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
104 |
custom_data=BETTERTRANSFORMER_DATA,
|
105 |
color="Quantization ποΈ",
|
|
|
114 |
# add layout
|
115 |
decode_fig.update_layout(
|
116 |
title={
|
117 |
+
"text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
|
118 |
"y": 0.95,
|
119 |
"x": 0.5,
|
120 |
"xanchor": "center",
|
src/control_panel.py
CHANGED
@@ -12,13 +12,6 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
12 |
# controls
|
13 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
14 |
with gr.Accordion("Control Panel ποΈ", open=False, elem_id="control-panel"):
|
15 |
-
with gr.Row():
|
16 |
-
with gr.Column():
|
17 |
-
search_bar = gr.Textbox(
|
18 |
-
label="Model π€",
|
19 |
-
info="π Search for a model name",
|
20 |
-
elem_id="search-bar",
|
21 |
-
)
|
22 |
with gr.Row():
|
23 |
with gr.Column(scale=1, variant="panel"):
|
24 |
score_slider = gr.Slider(
|
@@ -98,7 +91,6 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
98 |
return (
|
99 |
filter_button,
|
100 |
machine_textbox,
|
101 |
-
search_bar,
|
102 |
score_slider,
|
103 |
memory_slider,
|
104 |
backend_checkboxes,
|
@@ -110,27 +102,28 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
110 |
|
111 |
def filter_fn(
|
112 |
machine,
|
113 |
-
|
|
|
|
|
114 |
backends,
|
115 |
datatypes,
|
116 |
optimizations,
|
117 |
quantizations,
|
|
|
118 |
columns,
|
119 |
-
|
120 |
-
memory,
|
121 |
):
|
122 |
raw_df = get_llm_perf_df(machine=machine)
|
123 |
filtered_df = raw_df[
|
124 |
-
raw_df["Model π€"].str.contains(model, case=False)
|
125 |
-
|
126 |
& raw_df["DType π₯"].isin(datatypes)
|
127 |
& raw_df["Optimization π οΈ"].isin(optimizations)
|
128 |
& raw_df["Quantization ποΈ"].isin(quantizations)
|
129 |
& (raw_df["Open LLM Score (%)"] >= score)
|
130 |
& (raw_df["Allocated Memory (MB)"] <= memory)
|
131 |
]
|
132 |
-
filtered_leaderboard_df =
|
133 |
-
filtered_leaderboard_df = filtered_leaderboard_df[columns]
|
134 |
filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
|
135 |
filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
|
136 |
filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
|
@@ -154,16 +147,18 @@ def filter_fn(
|
|
154 |
def create_control_callback(
|
155 |
# button
|
156 |
filter_button,
|
157 |
-
#
|
158 |
machine_textbox,
|
159 |
-
|
160 |
score_slider,
|
161 |
memory_slider,
|
162 |
backend_checkboxes,
|
163 |
datatype_checkboxes,
|
164 |
optimization_checkboxes,
|
165 |
quantization_checkboxes,
|
|
|
166 |
columns_checkboxes,
|
|
|
167 |
# outputs
|
168 |
leaderboard_table,
|
169 |
lat_score_mem_plot,
|
@@ -177,15 +172,18 @@ def create_control_callback(
|
|
177 |
filter_button.click(
|
178 |
fn=filter_fn,
|
179 |
inputs=[
|
|
|
180 |
machine_textbox,
|
181 |
-
|
|
|
|
|
182 |
backend_checkboxes,
|
183 |
datatype_checkboxes,
|
184 |
optimization_checkboxes,
|
185 |
quantization_checkboxes,
|
|
|
186 |
columns_checkboxes,
|
187 |
-
|
188 |
-
memory_slider,
|
189 |
],
|
190 |
outputs=[
|
191 |
leaderboard_table,
|
@@ -200,23 +198,33 @@ def create_control_callback(
|
|
200 |
)
|
201 |
|
202 |
|
203 |
-
def select_fn(machine, columns):
|
204 |
raw_df = get_llm_perf_df(machine=machine)
|
205 |
selected_leaderboard_df = get_leaderboard_df(raw_df)
|
206 |
selected_leaderboard_df = selected_leaderboard_df[columns]
|
|
|
|
|
|
|
207 |
|
208 |
return selected_leaderboard_df
|
209 |
|
210 |
|
211 |
def create_select_callback(
|
212 |
-
#
|
213 |
machine_textbox,
|
|
|
214 |
columns_checkboxes,
|
|
|
215 |
# outputs
|
216 |
leaderboard_table,
|
217 |
):
|
218 |
columns_checkboxes.change(
|
219 |
fn=select_fn,
|
220 |
-
inputs=[machine_textbox, columns_checkboxes],
|
|
|
|
|
|
|
|
|
|
|
221 |
outputs=[leaderboard_table],
|
222 |
)
|
|
|
12 |
# controls
|
13 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
14 |
with gr.Accordion("Control Panel ποΈ", open=False, elem_id="control-panel"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
with gr.Row():
|
16 |
with gr.Column(scale=1, variant="panel"):
|
17 |
score_slider = gr.Slider(
|
|
|
91 |
return (
|
92 |
filter_button,
|
93 |
machine_textbox,
|
|
|
94 |
score_slider,
|
95 |
memory_slider,
|
96 |
backend_checkboxes,
|
|
|
102 |
|
103 |
def filter_fn(
|
104 |
machine,
|
105 |
+
# inputs
|
106 |
+
score,
|
107 |
+
memory,
|
108 |
backends,
|
109 |
datatypes,
|
110 |
optimizations,
|
111 |
quantizations,
|
112 |
+
# interactive
|
113 |
columns,
|
114 |
+
search,
|
|
|
115 |
):
|
116 |
raw_df = get_llm_perf_df(machine=machine)
|
117 |
filtered_df = raw_df[
|
118 |
+
# raw_df["Model π€"].str.contains(model, case=False)
|
119 |
+
raw_df["Backend π"].isin(backends)
|
120 |
& raw_df["DType π₯"].isin(datatypes)
|
121 |
& raw_df["Optimization π οΈ"].isin(optimizations)
|
122 |
& raw_df["Quantization ποΈ"].isin(quantizations)
|
123 |
& (raw_df["Open LLM Score (%)"] >= score)
|
124 |
& (raw_df["Allocated Memory (MB)"] <= memory)
|
125 |
]
|
126 |
+
filtered_leaderboard_df = select_fn(machine, columns, search)
|
|
|
127 |
filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
|
128 |
filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
|
129 |
filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
|
|
|
147 |
def create_control_callback(
|
148 |
# button
|
149 |
filter_button,
|
150 |
+
# fixed
|
151 |
machine_textbox,
|
152 |
+
# inputs
|
153 |
score_slider,
|
154 |
memory_slider,
|
155 |
backend_checkboxes,
|
156 |
datatype_checkboxes,
|
157 |
optimization_checkboxes,
|
158 |
quantization_checkboxes,
|
159 |
+
# interactive
|
160 |
columns_checkboxes,
|
161 |
+
search_bar,
|
162 |
# outputs
|
163 |
leaderboard_table,
|
164 |
lat_score_mem_plot,
|
|
|
172 |
filter_button.click(
|
173 |
fn=filter_fn,
|
174 |
inputs=[
|
175 |
+
# fixed
|
176 |
machine_textbox,
|
177 |
+
# inputs
|
178 |
+
score_slider,
|
179 |
+
memory_slider,
|
180 |
backend_checkboxes,
|
181 |
datatype_checkboxes,
|
182 |
optimization_checkboxes,
|
183 |
quantization_checkboxes,
|
184 |
+
# interactive
|
185 |
columns_checkboxes,
|
186 |
+
search_bar,
|
|
|
187 |
],
|
188 |
outputs=[
|
189 |
leaderboard_table,
|
|
|
198 |
)
|
199 |
|
200 |
|
201 |
+
def select_fn(machine, columns, search):
|
202 |
raw_df = get_llm_perf_df(machine=machine)
|
203 |
selected_leaderboard_df = get_leaderboard_df(raw_df)
|
204 |
selected_leaderboard_df = selected_leaderboard_df[columns]
|
205 |
+
selected_leaderboard_df = selected_leaderboard_df[
|
206 |
+
selected_leaderboard_df["Model π€"].str.contains(search, case=False)
|
207 |
+
]
|
208 |
|
209 |
return selected_leaderboard_df
|
210 |
|
211 |
|
212 |
def create_select_callback(
|
213 |
+
# fixed
|
214 |
machine_textbox,
|
215 |
+
# interactive
|
216 |
columns_checkboxes,
|
217 |
+
search_bar,
|
218 |
# outputs
|
219 |
leaderboard_table,
|
220 |
):
|
221 |
columns_checkboxes.change(
|
222 |
fn=select_fn,
|
223 |
+
inputs=[machine_textbox, columns_checkboxes, search_bar],
|
224 |
+
outputs=[leaderboard_table],
|
225 |
+
)
|
226 |
+
search_bar.change(
|
227 |
+
fn=select_fn,
|
228 |
+
inputs=[machine_textbox, columns_checkboxes, search_bar],
|
229 |
outputs=[leaderboard_table],
|
230 |
)
|
src/flashattentionv2.py
CHANGED
@@ -6,10 +6,10 @@ import plotly.express as px
|
|
6 |
FLASHATTENTIONV2_DATA = [
|
7 |
# open llm
|
8 |
"Model π€",
|
9 |
-
"Arch ποΈ",
|
10 |
"DType π₯",
|
11 |
"Backend π",
|
12 |
"Params (B)",
|
|
|
13 |
"Open LLM Score (%)",
|
14 |
# deployment settings
|
15 |
"DType π₯",
|
@@ -18,15 +18,15 @@ FLASHATTENTIONV2_DATA = [
|
|
18 |
"Quantization ποΈ",
|
19 |
"Optimization π οΈ FlashAttentionV2",
|
20 |
# primary measurements
|
21 |
-
"Prefill
|
22 |
-
"Prefill
|
23 |
-
"Decode
|
24 |
-
"Decode
|
25 |
-
"
|
26 |
-
"
|
27 |
# speedups
|
28 |
-
"Prefill
|
29 |
-
"Decode
|
30 |
]
|
31 |
|
32 |
|
@@ -43,15 +43,15 @@ def get_fa2_df(llm_perf_df):
|
|
43 |
suffixes=["", " FlashAttentionV2"],
|
44 |
)
|
45 |
# compute speedups
|
46 |
-
fa2_df["Prefill
|
47 |
-
|
48 |
-
)
|
49 |
-
fa2_df["Decode
|
50 |
-
(fa2_df["Decode
|
51 |
).round(2) - 100
|
52 |
# filter speedups > 1000%
|
53 |
-
fa2_df = fa2_df[fa2_df["Prefill
|
54 |
-
fa2_df = fa2_df[fa2_df["Decode
|
55 |
|
56 |
return fa2_df
|
57 |
|
@@ -61,8 +61,8 @@ def get_fa2_decode_fig(llm_perf_df):
|
|
61 |
# plot
|
62 |
decode_fig = px.box(
|
63 |
fa2_df,
|
64 |
-
x="
|
65 |
-
y="Decode
|
66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
67 |
custom_data=FLASHATTENTIONV2_DATA,
|
68 |
color="Quantization ποΈ",
|
@@ -77,7 +77,7 @@ def get_fa2_decode_fig(llm_perf_df):
|
|
77 |
# add layout
|
78 |
decode_fig.update_layout(
|
79 |
title={
|
80 |
-
"text": "Decode
|
81 |
"y": 0.95,
|
82 |
"x": 0.5,
|
83 |
"xanchor": "center",
|
@@ -98,8 +98,8 @@ def get_fa2_prefill_fig(llm_perf_df):
|
|
98 |
# plot
|
99 |
prefill_fig = px.box(
|
100 |
fa2_df,
|
101 |
-
x="
|
102 |
-
y="Prefill
|
103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
104 |
custom_data=FLASHATTENTIONV2_DATA,
|
105 |
color="Quantization ποΈ",
|
@@ -114,7 +114,7 @@ def get_fa2_prefill_fig(llm_perf_df):
|
|
114 |
# add layout
|
115 |
prefill_fig.update_layout(
|
116 |
title={
|
117 |
-
"text": "Prefill
|
118 |
"y": 0.95,
|
119 |
"x": 0.5,
|
120 |
"xanchor": "center",
|
|
|
6 |
FLASHATTENTIONV2_DATA = [
|
7 |
# open llm
|
8 |
"Model π€",
|
|
|
9 |
"DType π₯",
|
10 |
"Backend π",
|
11 |
"Params (B)",
|
12 |
+
"Architecture ποΈ",
|
13 |
"Open LLM Score (%)",
|
14 |
# deployment settings
|
15 |
"DType π₯",
|
|
|
18 |
"Quantization ποΈ",
|
19 |
"Optimization π οΈ FlashAttentionV2",
|
20 |
# primary measurements
|
21 |
+
"Prefill (s)",
|
22 |
+
"Prefill (s) FlashAttentionV2",
|
23 |
+
"Decode (tokens/s)",
|
24 |
+
"Decode (tokens/s) FlashAttentionV2",
|
25 |
+
"End-to-End (tokens/s)",
|
26 |
+
"End-to-End (tokens/s) FlashAttentionV2",
|
27 |
# speedups
|
28 |
+
"Prefill Speedup (%)",
|
29 |
+
"Decode Speedup (%)",
|
30 |
]
|
31 |
|
32 |
|
|
|
43 |
suffixes=["", " FlashAttentionV2"],
|
44 |
)
|
45 |
# compute speedups
|
46 |
+
fa2_df["Prefill Speedup (%)"] = ((fa2_df["Prefill (s)"] / fa2_df["Prefill (s) FlashAttentionV2"]) * 100).round(
|
47 |
+
2
|
48 |
+
) - 100
|
49 |
+
fa2_df["Decode Speedup (%)"] = (
|
50 |
+
(fa2_df["Decode (tokens/s) FlashAttentionV2"] / fa2_df["Decode (tokens/s)"]) * 100
|
51 |
).round(2) - 100
|
52 |
# filter speedups > 1000%
|
53 |
+
fa2_df = fa2_df[fa2_df["Prefill Speedup (%)"] < 1000]
|
54 |
+
fa2_df = fa2_df[fa2_df["Decode Speedup (%)"] < 1000]
|
55 |
|
56 |
return fa2_df
|
57 |
|
|
|
61 |
# plot
|
62 |
decode_fig = px.box(
|
63 |
fa2_df,
|
64 |
+
x="Architecture ποΈ",
|
65 |
+
y="Decode Speedup (%)",
|
66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
67 |
custom_data=FLASHATTENTIONV2_DATA,
|
68 |
color="Quantization ποΈ",
|
|
|
77 |
# add layout
|
78 |
decode_fig.update_layout(
|
79 |
title={
|
80 |
+
"text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
|
81 |
"y": 0.95,
|
82 |
"x": 0.5,
|
83 |
"xanchor": "center",
|
|
|
98 |
# plot
|
99 |
prefill_fig = px.box(
|
100 |
fa2_df,
|
101 |
+
x="Architecture ποΈ",
|
102 |
+
y="Prefill Speedup (%)",
|
103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
104 |
custom_data=FLASHATTENTIONV2_DATA,
|
105 |
color="Quantization ποΈ",
|
|
|
114 |
# add layout
|
115 |
prefill_fig.update_layout(
|
116 |
title={
|
117 |
+
"text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
|
118 |
"y": 0.95,
|
119 |
"x": 0.5,
|
120 |
"xanchor": "center",
|
src/latency_score_memory.py
CHANGED
@@ -4,18 +4,18 @@ import plotly.express as px
|
|
4 |
|
5 |
SCORE_MEMORY_LATENCY_DATA = [
|
6 |
"Model π€",
|
7 |
-
"Arch ποΈ",
|
8 |
-
"Params (B)",
|
9 |
"DType π₯",
|
10 |
"Backend π",
|
|
|
|
|
11 |
"Optimization π οΈ",
|
12 |
"Quantization ποΈ",
|
13 |
"Open LLM Score (%)",
|
14 |
-
"Prefill
|
15 |
-
"Decode
|
16 |
-
"
|
17 |
-
"
|
18 |
-
# "
|
19 |
]
|
20 |
|
21 |
|
@@ -24,10 +24,10 @@ def get_lat_score_mem_fig(llm_perf_df):
|
|
24 |
# plot
|
25 |
fig = px.scatter(
|
26 |
copy_df,
|
27 |
-
x="
|
28 |
y="Open LLM Score (%)",
|
29 |
-
size="
|
30 |
-
color="
|
31 |
custom_data=SCORE_MEMORY_LATENCY_DATA,
|
32 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
33 |
)
|
@@ -38,7 +38,7 @@ def get_lat_score_mem_fig(llm_perf_df):
|
|
38 |
)
|
39 |
fig.update_layout(
|
40 |
title={
|
41 |
-
"text": "
|
42 |
"y": 0.95,
|
43 |
"x": 0.5,
|
44 |
"xanchor": "center",
|
@@ -56,7 +56,7 @@ def get_lat_score_mem_fig(llm_perf_df):
|
|
56 |
|
57 |
def create_lat_score_mem_plot(llm_perf_df):
|
58 |
# descriptive text
|
59 |
-
gr.HTML("π Hover over the points π for additional information. ",elem_id="text")
|
60 |
# get figure
|
61 |
fig = get_lat_score_mem_fig(llm_perf_df)
|
62 |
# create plot
|
|
|
4 |
|
5 |
SCORE_MEMORY_LATENCY_DATA = [
|
6 |
"Model π€",
|
|
|
|
|
7 |
"DType π₯",
|
8 |
"Backend π",
|
9 |
+
"Params (B)",
|
10 |
+
"Architecture ποΈ",
|
11 |
"Optimization π οΈ",
|
12 |
"Quantization ποΈ",
|
13 |
"Open LLM Score (%)",
|
14 |
+
"Prefill (s)",
|
15 |
+
"Decode (tokens/s)",
|
16 |
+
"Memory (MB)",
|
17 |
+
"End-to-End (s)",
|
18 |
+
# "End-to-End (tokens/s)",
|
19 |
]
|
20 |
|
21 |
|
|
|
24 |
# plot
|
25 |
fig = px.scatter(
|
26 |
copy_df,
|
27 |
+
x="End-to-End (s)",
|
28 |
y="Open LLM Score (%)",
|
29 |
+
size="Memory (MB)",
|
30 |
+
color="Architecture ποΈ",
|
31 |
custom_data=SCORE_MEMORY_LATENCY_DATA,
|
32 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
33 |
)
|
|
|
38 |
)
|
39 |
fig.update_layout(
|
40 |
title={
|
41 |
+
"text": "vs. Score vs. Memory",
|
42 |
"y": 0.95,
|
43 |
"x": 0.5,
|
44 |
"xanchor": "center",
|
|
|
56 |
|
57 |
def create_lat_score_mem_plot(llm_perf_df):
|
58 |
# descriptive text
|
59 |
+
gr.HTML("π Hover over the points π for additional information. ", elem_id="text")
|
60 |
# get figure
|
61 |
fig = get_lat_score_mem_fig(llm_perf_df)
|
62 |
# create plot
|
src/leaderboard.py
CHANGED
@@ -8,9 +8,9 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
|
|
8 |
"Model π€": "markdown",
|
9 |
"Experiment π§ͺ": "str",
|
10 |
# primary measurements
|
11 |
-
"Prefill
|
12 |
-
"Decode
|
13 |
-
"
|
14 |
"Energy (tokens/kWh)": "number",
|
15 |
# deployment settings
|
16 |
"DType π₯": "str",
|
@@ -18,15 +18,25 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
|
|
18 |
"Optimization π οΈ": "str",
|
19 |
"Quantization ποΈ": "str",
|
20 |
# additional measurements
|
21 |
-
"
|
22 |
"Params (B)": "number",
|
23 |
"Open LLM Score (%)": "number",
|
24 |
-
"
|
25 |
-
"
|
26 |
"Reserved Memory (MB)": "number",
|
27 |
"Used Memory (MB)": "number",
|
28 |
}
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def process_model(model_name):
|
32 |
link = f"https://huggingface.co/{model_name}"
|
@@ -48,20 +58,29 @@ def get_leaderboard_df(llm_perf_df):
|
|
48 |
def create_leaderboard_table(llm_perf_df):
|
49 |
# get dataframe
|
50 |
leaderboard_df = get_leaderboard_df(llm_perf_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
# create checkboxes
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
59 |
# create table
|
60 |
leaderboard_table = gr.components.Dataframe(
|
61 |
-
value=leaderboard_df,
|
62 |
datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
|
63 |
headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
|
64 |
elem_id="leaderboard-table",
|
65 |
)
|
66 |
|
67 |
-
return
|
|
|
8 |
"Model π€": "markdown",
|
9 |
"Experiment π§ͺ": "str",
|
10 |
# primary measurements
|
11 |
+
"Prefill (s)": "number",
|
12 |
+
"Decode (tokens/s)": "number",
|
13 |
+
"Memory (MB)": "number",
|
14 |
"Energy (tokens/kWh)": "number",
|
15 |
# deployment settings
|
16 |
"DType π₯": "str",
|
|
|
18 |
"Optimization π οΈ": "str",
|
19 |
"Quantization ποΈ": "str",
|
20 |
# additional measurements
|
21 |
+
"Architecture ποΈ": "markdown",
|
22 |
"Params (B)": "number",
|
23 |
"Open LLM Score (%)": "number",
|
24 |
+
"End-to-End (s)": "number",
|
25 |
+
"End-to-End (tokens/s)": "number",
|
26 |
"Reserved Memory (MB)": "number",
|
27 |
"Used Memory (MB)": "number",
|
28 |
}
|
29 |
|
30 |
+
PRIMARY_COLUMNS = [
|
31 |
+
"Model π€",
|
32 |
+
"Experiment π§ͺ",
|
33 |
+
"Prefill (s)",
|
34 |
+
"Decode (tokens/s)",
|
35 |
+
"Memory (MB)",
|
36 |
+
"Energy (tokens/kWh)",
|
37 |
+
"Open LLM Score (%)",
|
38 |
+
]
|
39 |
+
|
40 |
|
41 |
def process_model(model_name):
|
42 |
link = f"https://huggingface.co/{model_name}"
|
|
|
58 |
def create_leaderboard_table(llm_perf_df):
|
59 |
# get dataframe
|
60 |
leaderboard_df = get_leaderboard_df(llm_perf_df)
|
61 |
+
|
62 |
+
# create search bar
|
63 |
+
with gr.Row():
|
64 |
+
search_bar = gr.Textbox(
|
65 |
+
label="Model π€",
|
66 |
+
info="π Search for a model name",
|
67 |
+
elem_id="search-bar",
|
68 |
+
)
|
69 |
# create checkboxes
|
70 |
+
with gr.Row():
|
71 |
+
columns_checkboxes = gr.CheckboxGroup(
|
72 |
+
label="Columns π",
|
73 |
+
value=PRIMARY_COLUMNS,
|
74 |
+
choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
|
75 |
+
info="βοΈ Select the columns to display",
|
76 |
+
elem_id="columns-checkboxes",
|
77 |
+
)
|
78 |
# create table
|
79 |
leaderboard_table = gr.components.Dataframe(
|
80 |
+
value=leaderboard_df[PRIMARY_COLUMNS],
|
81 |
datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
|
82 |
headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
|
83 |
elem_id="leaderboard-table",
|
84 |
)
|
85 |
|
86 |
+
return search_bar, columns_checkboxes, leaderboard_table
|
src/llm_perf.py
CHANGED
@@ -12,9 +12,9 @@ COLUMNS_MAPPING = {
|
|
12 |
"Model": "Model π€",
|
13 |
"experiment_name": "Experiment π§ͺ",
|
14 |
# primary measurements
|
15 |
-
"forward.latency(s)": "Prefill
|
16 |
-
"decode.throughput(tokens/s)": "Decode
|
17 |
-
"generate.max_memory_allocated(MB)": "
|
18 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
|
19 |
# deployment settings
|
20 |
"backend.name": "Backend π",
|
@@ -22,18 +22,18 @@ COLUMNS_MAPPING = {
|
|
22 |
"optimization": "Optimization π οΈ",
|
23 |
"quantization": "Quantization ποΈ",
|
24 |
# additional measurements
|
25 |
-
"Arch": "Arch ποΈ",
|
26 |
"Size": "Params (B)",
|
|
|
27 |
"Score": "Open LLM Score (%)",
|
28 |
-
"generate.latency(s)": "
|
29 |
-
"generate.throughput(tokens/s)": "
|
30 |
"generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
|
31 |
"generate.max_memory_used(MB)": "Used Memory (MB)",
|
32 |
}
|
33 |
SORTING_COLUMNS = [
|
34 |
"Open LLM Score (%)",
|
35 |
-
"
|
36 |
-
"
|
37 |
]
|
38 |
SORTING_ASCENDING = [False, True, False]
|
39 |
|
@@ -107,6 +107,13 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
|
|
107 |
].apply(lambda x: process_quantization_scheme(x), axis=1)
|
108 |
# process experiment name
|
109 |
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
# add arch
|
111 |
llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
|
112 |
# filter columns
|
|
|
12 |
"Model": "Model π€",
|
13 |
"experiment_name": "Experiment π§ͺ",
|
14 |
# primary measurements
|
15 |
+
"forward.latency(s)": "Prefill (s)",
|
16 |
+
"decode.throughput(tokens/s)": "Decode (tokens/s)",
|
17 |
+
"generate.max_memory_allocated(MB)": "Memory (MB)",
|
18 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
|
19 |
# deployment settings
|
20 |
"backend.name": "Backend π",
|
|
|
22 |
"optimization": "Optimization π οΈ",
|
23 |
"quantization": "Quantization ποΈ",
|
24 |
# additional measurements
|
|
|
25 |
"Size": "Params (B)",
|
26 |
+
"Arch": "Architecture ποΈ",
|
27 |
"Score": "Open LLM Score (%)",
|
28 |
+
"generate.latency(s)": "End-to-End (s)",
|
29 |
+
"generate.throughput(tokens/s)": "End-to-End (tokens/s)",
|
30 |
"generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
|
31 |
"generate.max_memory_used(MB)": "Used Memory (MB)",
|
32 |
}
|
33 |
SORTING_COLUMNS = [
|
34 |
"Open LLM Score (%)",
|
35 |
+
"Decode (tokens/s)",
|
36 |
+
"Prefill (s)",
|
37 |
]
|
38 |
SORTING_ASCENDING = [False, True, False]
|
39 |
|
|
|
107 |
].apply(lambda x: process_quantization_scheme(x), axis=1)
|
108 |
# process experiment name
|
109 |
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
|
110 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
|
111 |
+
lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
|
112 |
+
)
|
113 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("awq-4bit", "awq"))
|
114 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("gptq-4bit", "gptq"))
|
115 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bettertransformer", "sdpa"))
|
116 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("flash-attention-v2", "FA2"))
|
117 |
# add arch
|
118 |
llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
|
119 |
# filter columns
|
src/quantization_kernels.py
CHANGED
@@ -6,10 +6,10 @@ import plotly.express as px
|
|
6 |
QUANT_DATA = [
|
7 |
# open llm
|
8 |
"Model π€",
|
9 |
-
"Arch ποΈ",
|
10 |
"DType π₯",
|
11 |
"Backend π",
|
12 |
"Params (B)",
|
|
|
13 |
"Open LLM Score (%)",
|
14 |
# deployment settings
|
15 |
"DType π₯",
|
@@ -19,13 +19,13 @@ QUANT_DATA = [
|
|
19 |
"Optimization π οΈ Custom Kernel",
|
20 |
"Quantization ποΈ Custom Kernel",
|
21 |
# primary measurements
|
22 |
-
"Prefill
|
23 |
-
"Prefill
|
24 |
-
"Decode
|
25 |
-
"Decode
|
26 |
# speedups
|
27 |
-
"Prefill
|
28 |
-
"Decode
|
29 |
]
|
30 |
|
31 |
|
@@ -33,10 +33,10 @@ def get_quant_df(llm_perf_df):
|
|
33 |
copy_df = llm_perf_df.copy()
|
34 |
# separate vanilla GPTQ experiments from Custom Kernel experiments
|
35 |
vanilla_df = copy_df[
|
36 |
-
(copy_df["Backend π"] == "pytorch")
|
37 |
-
(copy_df["Quantization ποΈ"] == "None")
|
38 |
-
(copy_df["Optimization π οΈ"] == "None")
|
39 |
-
(copy_df["DType π₯"] == "float16")
|
40 |
]
|
41 |
exllamav1_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1")]
|
42 |
exllamav2_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2")]
|
@@ -70,15 +70,15 @@ def get_quant_df(llm_perf_df):
|
|
70 |
# concat the two dataframes row-wise
|
71 |
quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
|
72 |
# compute speedups
|
73 |
-
quant_df["Prefill
|
74 |
-
|
75 |
-
)
|
76 |
-
quant_df["Decode
|
77 |
-
(quant_df["Decode
|
78 |
).round(2) - 100
|
79 |
# filter speedups > 1000%
|
80 |
-
quant_df = quant_df[quant_df["Prefill
|
81 |
-
quant_df = quant_df[quant_df["Decode
|
82 |
|
83 |
return quant_df
|
84 |
|
@@ -88,8 +88,8 @@ def get_quant_decode_fig(llm_perf_df):
|
|
88 |
# plot
|
89 |
decode_fig = px.box(
|
90 |
quant_df,
|
91 |
-
x="
|
92 |
-
y="Decode
|
93 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
94 |
custom_data=QUANT_DATA,
|
95 |
color="Quantization ποΈ Custom Kernel",
|
@@ -102,7 +102,7 @@ def get_quant_decode_fig(llm_perf_df):
|
|
102 |
# add layout
|
103 |
decode_fig.update_layout(
|
104 |
title={
|
105 |
-
"text": "Decode
|
106 |
"y": 0.95,
|
107 |
"x": 0.5,
|
108 |
"xanchor": "center",
|
@@ -123,8 +123,8 @@ def get_quant_prefill_fig(llm_perf_df):
|
|
123 |
# plot
|
124 |
prefill_fig = px.box(
|
125 |
quant_df,
|
126 |
-
x="
|
127 |
-
y="Prefill
|
128 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
129 |
custom_data=QUANT_DATA,
|
130 |
color="Quantization ποΈ Custom Kernel",
|
@@ -137,7 +137,7 @@ def get_quant_prefill_fig(llm_perf_df):
|
|
137 |
# add layout
|
138 |
prefill_fig.update_layout(
|
139 |
title={
|
140 |
-
"text": "Prefill
|
141 |
"y": 0.95,
|
142 |
"x": 0.5,
|
143 |
"xanchor": "center",
|
|
|
6 |
QUANT_DATA = [
|
7 |
# open llm
|
8 |
"Model π€",
|
|
|
9 |
"DType π₯",
|
10 |
"Backend π",
|
11 |
"Params (B)",
|
12 |
+
"Architecture ποΈ",
|
13 |
"Open LLM Score (%)",
|
14 |
# deployment settings
|
15 |
"DType π₯",
|
|
|
19 |
"Optimization π οΈ Custom Kernel",
|
20 |
"Quantization ποΈ Custom Kernel",
|
21 |
# primary measurements
|
22 |
+
"Prefill (s)",
|
23 |
+
"Prefill (s) Custom Kernel",
|
24 |
+
"Decode (tokens/s)",
|
25 |
+
"Decode (tokens/s) Custom Kernel",
|
26 |
# speedups
|
27 |
+
"Prefill Speedup (%)",
|
28 |
+
"Decode Speedup (%)",
|
29 |
]
|
30 |
|
31 |
|
|
|
33 |
copy_df = llm_perf_df.copy()
|
34 |
# separate vanilla GPTQ experiments from Custom Kernel experiments
|
35 |
vanilla_df = copy_df[
|
36 |
+
(copy_df["Backend π"] == "pytorch")
|
37 |
+
& (copy_df["Quantization ποΈ"] == "None")
|
38 |
+
& (copy_df["Optimization π οΈ"] == "None")
|
39 |
+
& (copy_df["DType π₯"] == "float16")
|
40 |
]
|
41 |
exllamav1_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1")]
|
42 |
exllamav2_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2")]
|
|
|
70 |
# concat the two dataframes row-wise
|
71 |
quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
|
72 |
# compute speedups
|
73 |
+
quant_df["Prefill Speedup (%)"] = ((quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100).round(
|
74 |
+
2
|
75 |
+
) - 100
|
76 |
+
quant_df["Decode Speedup (%)"] = (
|
77 |
+
(quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"]) * 100
|
78 |
).round(2) - 100
|
79 |
# filter speedups > 1000%
|
80 |
+
quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
|
81 |
+
quant_df = quant_df[quant_df["Decode Speedup (%)"] < 1000]
|
82 |
|
83 |
return quant_df
|
84 |
|
|
|
88 |
# plot
|
89 |
decode_fig = px.box(
|
90 |
quant_df,
|
91 |
+
x="Architecture ποΈ",
|
92 |
+
y="Decode Speedup (%)",
|
93 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
94 |
custom_data=QUANT_DATA,
|
95 |
color="Quantization ποΈ Custom Kernel",
|
|
|
102 |
# add layout
|
103 |
decode_fig.update_layout(
|
104 |
title={
|
105 |
+
"text": "Decode Speedup per Architecture",
|
106 |
"y": 0.95,
|
107 |
"x": 0.5,
|
108 |
"xanchor": "center",
|
|
|
123 |
# plot
|
124 |
prefill_fig = px.box(
|
125 |
quant_df,
|
126 |
+
x="Architecture ποΈ",
|
127 |
+
y="Prefill Speedup (%)",
|
128 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
129 |
custom_data=QUANT_DATA,
|
130 |
color="Quantization ποΈ Custom Kernel",
|
|
|
137 |
# add layout
|
138 |
prefill_fig.update_layout(
|
139 |
title={
|
140 |
+
"text": "Prefill Speedup per Architecture",
|
141 |
"y": 0.95,
|
142 |
"x": 0.5,
|
143 |
"xanchor": "center",
|