IlyasMoutawwakil HF staff committed on
Commit
08604d0
·
1 Parent(s): affd732
Files changed (5) hide show
  1. src/content.py +1 -1
  2. src/control_panel.py +26 -8
  3. src/exllama.py +5 -5
  4. src/llm_perf.py +1 -0
  5. src/utils.py +4 -0
src/content.py CHANGED
@@ -7,7 +7,7 @@ The 🤗 LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency
7
 
8
  Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
9
  - Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
10
- - Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
11
  """
12
 
13
  ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 
7
 
8
  Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
9
  - Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
10
+ - Hardware/Backend/Optimization performance requests should be made in the [llm-perf-backend repository](https://github.com/IlyasMoutawwakil/llm-perf-backend) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
11
  """
12
 
13
  ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
src/control_panel.py CHANGED
@@ -10,7 +10,7 @@ from src.exllama import get_exllama_prefill_fig, get_exllama_decode_fig
10
 
11
  def create_control_panel(machine: str = "hf-dgx-01"):
12
  # descriptive text
13
- gr.HTML("Use this control panel to filter this leaderboard.", elem_id="text")
14
  # controls
15
  machine_textbox = gr.Textbox(value=machine, visible=False)
16
  with gr.Row():
@@ -21,14 +21,14 @@ def create_control_panel(machine: str = "hf-dgx-01"):
21
  elem_id="search-bar",
22
  )
23
  with gr.Row():
24
- with gr.Column(scale=1):
25
  score_slider = gr.Slider(
26
  label="Open LLM Score (%) 📈",
27
  info="🎚️ Slide to minimum Open LLM score",
28
  value=0,
29
  elem_id="threshold-slider",
30
  )
31
- with gr.Column(scale=1):
32
  memory_slider = gr.Slider(
33
  label="Peak Memory (MB) 📈",
34
  info="🎚️ Slide to maximum Peak Memory",
@@ -46,7 +46,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
46
  elem_id="backend-checkboxes",
47
  )
48
  with gr.Row():
49
- with gr.Column(scale=1):
50
  datatype_checkboxes = gr.CheckboxGroup(
51
  label="Load DTypes 📥",
52
  choices=["float32", "float16", "bfloat16"],
@@ -54,7 +54,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
54
  info="☑️ Select the load data types",
55
  elem_id="dtype-checkboxes",
56
  )
57
- with gr.Column(scale=1):
58
  optimization_checkboxes = gr.CheckboxGroup(
59
  label="Optimizations 🛠️",
60
  choices=["None", "BetterTransformer", "FlashAttentionV2"],
@@ -62,11 +62,29 @@ def create_control_panel(machine: str = "hf-dgx-01"):
62
  info="☑️ Select the optimization",
63
  elem_id="optimization-checkboxes",
64
  )
65
- with gr.Column(scale=1):
66
  quantization_checkboxes = gr.CheckboxGroup(
67
  label="Quantizations 🗜️",
68
- choices=["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
69
- value=["None", "BnB.4bit", "BnB.8bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  info="☑️ Select the quantization schemes",
71
  elem_id="quantization-checkboxes",
72
  )
 
10
 
11
  def create_control_panel(machine: str = "hf-dgx-01"):
12
  # descriptive text
13
+ gr.HTML("Use this control panel to filter the leaderboard.", elem_id="text")
14
  # controls
15
  machine_textbox = gr.Textbox(value=machine, visible=False)
16
  with gr.Row():
 
21
  elem_id="search-bar",
22
  )
23
  with gr.Row():
24
+ with gr.Column(scale=1, variant="panel"):
25
  score_slider = gr.Slider(
26
  label="Open LLM Score (%) 📈",
27
  info="🎚️ Slide to minimum Open LLM score",
28
  value=0,
29
  elem_id="threshold-slider",
30
  )
31
+ with gr.Column(scale=1, variant="panel"):
32
  memory_slider = gr.Slider(
33
  label="Peak Memory (MB) 📈",
34
  info="🎚️ Slide to maximum Peak Memory",
 
46
  elem_id="backend-checkboxes",
47
  )
48
  with gr.Row():
49
+ with gr.Column(scale=1, variant="panel"):
50
  datatype_checkboxes = gr.CheckboxGroup(
51
  label="Load DTypes 📥",
52
  choices=["float32", "float16", "bfloat16"],
 
54
  info="☑️ Select the load data types",
55
  elem_id="dtype-checkboxes",
56
  )
57
+ with gr.Column(scale=1, variant="panel"):
58
  optimization_checkboxes = gr.CheckboxGroup(
59
  label="Optimizations 🛠️",
60
  choices=["None", "BetterTransformer", "FlashAttentionV2"],
 
62
  info="☑️ Select the optimization",
63
  elem_id="optimization-checkboxes",
64
  )
65
+ with gr.Column(scale=2):
66
  quantization_checkboxes = gr.CheckboxGroup(
67
  label="Quantizations 🗜️",
68
+ choices=[
69
+ "None",
70
+ "BnB.4bit",
71
+ "BnB.8bit",
72
+ "GPTQ.4bit",
73
+ "GPTQ.4bit+ExllamaV1",
74
+ "GPTQ.4bit+ExllamaV2",
75
+ "AWQ.4bit+GEMM",
76
+ "AWQ.4bit+GEMV",
77
+ ],
78
+ value=[
79
+ "None",
80
+ "BnB.4bit",
81
+ "BnB.8bit",
82
+ "GPTQ.4bit",
83
+ "GPTQ.4bit+ExllamaV1",
84
+ "GPTQ.4bit+ExllamaV2",
85
+ "AWQ.4bit+GEMM",
86
+ "AWQ.4bit+GEMV",
87
+ ],
88
  info="☑️ Select the quantization schemes",
89
  elem_id="quantization-checkboxes",
90
  )
src/exllama.py CHANGED
@@ -29,11 +29,11 @@ EXLLAMA_DATA = [
29
 
30
 
31
  def get_exllama_df(llm_perf_df):
32
- exllama_df = llm_perf_df.copy()
33
- # seperate original model experiments from Exllama experiments
34
- gptq_df = exllama_df[(exllama_df["Quantization 🗜️"] == "GPTQ.4bit")]
35
- exllamav1_df = exllama_df[(exllama_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
36
- exllamav2_df = exllama_df[(exllama_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
37
  # merge the three dataframes
38
  exllamav1_df = pd.merge(
39
  gptq_df,
 
29
 
30
 
31
  def get_exllama_df(llm_perf_df):
32
+ copy_df = llm_perf_df.copy()
33
+ # seperate vanilla GPTQ experiments from Exllama experiments
34
+ gptq_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit")]
35
+ exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
36
+ exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
37
  # merge the three dataframes
38
  exllamav1_df = pd.merge(
39
  gptq_df,
src/llm_perf.py CHANGED
@@ -94,6 +94,7 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
94
  [
95
  "backend.quantization_scheme",
96
  "backend.quantization_config.bits",
 
97
  "backend.quantization_config.load_in_4bit",
98
  "backend.quantization_config.load_in_8bit",
99
  "backend.quantization_config.exllama_config.version",
 
94
  [
95
  "backend.quantization_scheme",
96
  "backend.quantization_config.bits",
97
+ "backend.quantization_config.version",
98
  "backend.quantization_config.load_in_4bit",
99
  "backend.quantization_config.load_in_8bit",
100
  "backend.quantization_config.exllama_config.version",
src/utils.py CHANGED
@@ -62,6 +62,10 @@ def process_quantization_scheme(x):
62
  return "GPTQ.4bit+ExllamaV2"
63
  elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
64
  return "GPTQ.4bit"
 
 
 
 
65
  else:
66
  return "None"
67
 
 
62
  return "GPTQ.4bit+ExllamaV2"
63
  elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
64
  return "GPTQ.4bit"
65
+ elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemm":
66
+ return "AWQ.4bit+GEMM"
67
+ elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemv":
68
+ return "AWQ.4bit+GEMV"
69
  else:
70
  return "None"
71