Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
pminervini
committed on
Commit
·
e75cd03
1
Parent(s):
ceca114
update
Browse files
- cli/analysis-cli.py +2 -6
- cli/completed-cli.py +2 -2
- cli/eval-cli.py +25 -9
- cli/nqswap-upload-cli.py +12 -0
- cli/nqswap/original.jsonl +0 -0
- cli/nqswap/substituted.jsonl +0 -0
cli/analysis-cli.py
CHANGED
@@ -72,6 +72,7 @@ def sanitise_dataset(name: str) -> str:
|
|
72 |
res = res.replace("summarization", "Summarization")
|
73 |
res = res.replace("dialogue", "Dialog")
|
74 |
res = res.replace("halueval", "HaluEval")
|
|
|
75 |
res = res.replace("_", " ")
|
76 |
return res
|
77 |
|
@@ -135,11 +136,6 @@ if data_map is None:
|
|
135 |
if 'memo-trap_v2' in dataset_name:
|
136 |
to_add = False
|
137 |
|
138 |
-
if 'selfcheck' in dataset_name:
|
139 |
-
# if 'max' in metric_name:
|
140 |
-
# to_add = False
|
141 |
-
pass
|
142 |
-
|
143 |
if 'faithdial' in dataset_name:
|
144 |
to_add = False
|
145 |
|
@@ -166,7 +162,7 @@ if data_map is None:
|
|
166 |
if 'fever' in dataset_name:
|
167 |
to_add = False
|
168 |
|
169 |
-
if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' in dataset_name:
|
170 |
to_add = False
|
171 |
|
172 |
if isinstance(value, str):
|
|
|
72 |
res = res.replace("summarization", "Summarization")
|
73 |
res = res.replace("dialogue", "Dialog")
|
74 |
res = res.replace("halueval", "HaluEval")
|
75 |
+
res = res.replace("_v2", "")
|
76 |
res = res.replace("_", " ")
|
77 |
return res
|
78 |
|
|
|
136 |
if 'memo-trap_v2' in dataset_name:
|
137 |
to_add = False
|
138 |
|
|
|
|
|
|
|
|
|
|
|
139 |
if 'faithdial' in dataset_name:
|
140 |
to_add = False
|
141 |
|
|
|
162 |
if 'fever' in dataset_name:
|
163 |
to_add = False
|
164 |
|
165 |
+
if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' not in dataset_name:
|
166 |
to_add = False
|
167 |
|
168 |
if isinstance(value, str):
|
cli/completed-cli.py
CHANGED
@@ -97,7 +97,7 @@ def process_finished_requests() -> bool:
|
|
97 |
random.shuffle(eval_requests)
|
98 |
|
99 |
from src.leaderboard.read_evals import get_raw_eval_results
|
100 |
-
eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND
|
101 |
|
102 |
result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
|
103 |
result_name_to_result = {r.eval_name: r for r in eval_results}
|
@@ -117,7 +117,7 @@ def process_finished_requests() -> bool:
|
|
117 |
eval_request: EvalRequest = result_name_to_request[result_name]
|
118 |
|
119 |
# print(eval_result)
|
120 |
-
print(result_name, 'is incomplete -- missing task:', task_name)
|
121 |
|
122 |
|
123 |
if __name__ == "__main__":
|
|
|
97 |
random.shuffle(eval_requests)
|
98 |
|
99 |
from src.leaderboard.read_evals import get_raw_eval_results
|
100 |
+
eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
|
101 |
|
102 |
result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
|
103 |
result_name_to_result = {r.eval_name: r for r in eval_results}
|
|
|
117 |
eval_request: EvalRequest = result_name_to_request[result_name]
|
118 |
|
119 |
# print(eval_result)
|
120 |
+
print(result_name, 'is incomplete -- missing task:', task_name, eval_result, eval_request.likes)
|
121 |
|
122 |
|
123 |
if __name__ == "__main__":
|
cli/eval-cli.py
CHANGED
@@ -8,15 +8,21 @@ from src.backend.manage_requests import EvalRequest
|
|
8 |
from src.backend.run_eval_suite import run_evaluation
|
9 |
|
10 |
from src.backend.tasks.xsum.task import XSum
|
|
|
|
|
11 |
from src.backend.tasks.cnndm.task import CNNDM
|
12 |
-
from src.backend.tasks.
|
|
|
|
|
13 |
|
14 |
-
from lm_eval.tasks import
|
15 |
from lm_eval import tasks, evaluator, utils
|
16 |
|
17 |
from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
|
18 |
from src.envs import QUEUE_REPO
|
19 |
|
|
|
|
|
20 |
|
21 |
def main():
|
22 |
# snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
|
@@ -29,15 +35,24 @@ def main():
|
|
29 |
status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
|
30 |
|
31 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
32 |
-
eval_requests: list[EvalRequest] = get_eval_requests(job_status=status,
|
33 |
-
|
|
|
|
|
|
|
|
|
34 |
|
35 |
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
|
36 |
# my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
|
37 |
# my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
|
38 |
# my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
|
39 |
# my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
eval_logger = utils.eval_logger
|
43 |
import logging
|
@@ -47,12 +62,13 @@ def main():
|
|
47 |
# task_names = ['triviaqa']
|
48 |
# TASKS_HARNESS = [task.value for task in Tasks]
|
49 |
|
50 |
-
include_task_folder("src/backend/tasks/")
|
51 |
-
|
|
|
52 |
|
53 |
# breakpoint()
|
54 |
|
55 |
-
print(
|
56 |
|
57 |
for task in TASKS_HARNESS:
|
58 |
print(f"Selected Tasks: [{task}]")
|
@@ -60,7 +76,7 @@ def main():
|
|
60 |
|
61 |
# breakpoint()
|
62 |
results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
|
63 |
-
batch_size=1, device="mps", use_cache=None, limit=
|
64 |
print('AAA', results["results"])
|
65 |
|
66 |
breakpoint()
|
|
|
8 |
from src.backend.run_eval_suite import run_evaluation
|
9 |
|
10 |
from src.backend.tasks.xsum.task import XSum
|
11 |
+
from src.backend.tasks.xsum.task_v2 import XSumv2
|
12 |
+
|
13 |
from src.backend.tasks.cnndm.task import CNNDM
|
14 |
+
from src.backend.tasks.cnndm.task_v2 import CNNDMv2
|
15 |
+
|
16 |
+
from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
|
17 |
|
18 |
+
from lm_eval.tasks import TaskManager
|
19 |
from lm_eval import tasks, evaluator, utils
|
20 |
|
21 |
from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
|
22 |
from src.envs import QUEUE_REPO
|
23 |
|
24 |
+
from lm_eval.models.huggingface import HFLM
|
25 |
+
|
26 |
|
27 |
def main():
|
28 |
# snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
|
|
|
35 |
status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
|
36 |
|
37 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
38 |
+
eval_requests: list[EvalRequest] = get_eval_requests(job_status=status,
|
39 |
+
hf_repo=QUEUE_REPO,
|
40 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
41 |
+
do_download=False)
|
42 |
+
# eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
|
43 |
+
eval_request = [r for r in eval_requests if 'meta-llama/Llama-2-7b-hf' in r.model][0]
|
44 |
|
45 |
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
|
46 |
# my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
|
47 |
# my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
|
48 |
# my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
|
49 |
# my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
|
50 |
+
|
51 |
+
# my_task = Task("nq_swap", "exact_match", "NQ-Swap", 2)
|
52 |
+
# my_task = Task("memo-trap_v2", "acc", "XXX", 2)
|
53 |
+
my_task = Task("xsum_v2", "rougeL", "XXX", 0)
|
54 |
+
# my_task = Task("squadv2", "exact", "XXX", 0)
|
55 |
+
# my_task = Task("scrolls_qasper", "f1", "XXX", 0)
|
56 |
|
57 |
eval_logger = utils.eval_logger
|
58 |
import logging
|
|
|
62 |
# task_names = ['triviaqa']
|
63 |
# TASKS_HARNESS = [task.value for task in Tasks]
|
64 |
|
65 |
+
# include_task_folder("src/backend/tasks/")
|
66 |
+
task_manager = TaskManager(include_path="./src/backend/tasks/")
|
67 |
+
# task_manager.initialize_tasks(include_path="src/backend/tasks/")
|
68 |
|
69 |
# breakpoint()
|
70 |
|
71 |
+
print(task_manager.all_tasks)
|
72 |
|
73 |
for task in TASKS_HARNESS:
|
74 |
print(f"Selected Tasks: [{task}]")
|
|
|
76 |
|
77 |
# breakpoint()
|
78 |
results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
|
79 |
+
batch_size=1, device="mps", use_cache=None, limit=2, write_out=True, task_manager=task_manager)
|
80 |
print('AAA', results["results"])
|
81 |
|
82 |
breakpoint()
|
cli/nqswap-upload-cli.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
from datasets import load_dataset
|
4 |
+
|
5 |
+
path = 'pminervini/NQ-Swap'
|
6 |
+
|
7 |
+
ds = load_dataset("json",
|
8 |
+
data_files={
|
9 |
+
'original': 'nqswap/original.jsonl',
|
10 |
+
'substituted': 'nqswap/substituted.jsonl'
|
11 |
+
})
|
12 |
+
ds.push_to_hub(path)
|
cli/nqswap/original.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
cli/nqswap/substituted.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|