pminervini committed on
Commit
e75cd03
·
1 Parent(s): ceca114
cli/analysis-cli.py CHANGED
@@ -72,6 +72,7 @@ def sanitise_dataset(name: str) -> str:
72
  res = res.replace("summarization", "Summarization")
73
  res = res.replace("dialogue", "Dialog")
74
  res = res.replace("halueval", "HaluEval")
 
75
  res = res.replace("_", " ")
76
  return res
77
 
@@ -135,11 +136,6 @@ if data_map is None:
135
  if 'memo-trap_v2' in dataset_name:
136
  to_add = False
137
 
138
- if 'selfcheck' in dataset_name:
139
- # if 'max' in metric_name:
140
- # to_add = False
141
- pass
142
-
143
  if 'faithdial' in dataset_name:
144
  to_add = False
145
 
@@ -166,7 +162,7 @@ if data_map is None:
166
  if 'fever' in dataset_name:
167
  to_add = False
168
 
169
- if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' in dataset_name:
170
  to_add = False
171
 
172
  if isinstance(value, str):
 
72
  res = res.replace("summarization", "Summarization")
73
  res = res.replace("dialogue", "Dialog")
74
  res = res.replace("halueval", "HaluEval")
75
+ res = res.replace("_v2", "")
76
  res = res.replace("_", " ")
77
  return res
78
 
 
136
  if 'memo-trap_v2' in dataset_name:
137
  to_add = False
138
 
 
 
 
 
 
139
  if 'faithdial' in dataset_name:
140
  to_add = False
141
 
 
162
  if 'fever' in dataset_name:
163
  to_add = False
164
 
165
+ if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' not in dataset_name:
166
  to_add = False
167
 
168
  if isinstance(value, str):
cli/completed-cli.py CHANGED
@@ -97,7 +97,7 @@ def process_finished_requests() -> bool:
97
  random.shuffle(eval_requests)
98
 
99
  from src.leaderboard.read_evals import get_raw_eval_results
100
- eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND, True)
101
 
102
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
103
  result_name_to_result = {r.eval_name: r for r in eval_results}
@@ -117,7 +117,7 @@ def process_finished_requests() -> bool:
117
  eval_request: EvalRequest = result_name_to_request[result_name]
118
 
119
  # print(eval_result)
120
- print(result_name, 'is incomplete -- missing task:', task_name)
121
 
122
 
123
  if __name__ == "__main__":
 
97
  random.shuffle(eval_requests)
98
 
99
  from src.leaderboard.read_evals import get_raw_eval_results
100
+ eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
101
 
102
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
103
  result_name_to_result = {r.eval_name: r for r in eval_results}
 
117
  eval_request: EvalRequest = result_name_to_request[result_name]
118
 
119
  # print(eval_result)
120
+ print(result_name, 'is incomplete -- missing task:', task_name, eval_result, eval_request.likes)
121
 
122
 
123
  if __name__ == "__main__":
cli/eval-cli.py CHANGED
@@ -8,15 +8,21 @@ from src.backend.manage_requests import EvalRequest
8
  from src.backend.run_eval_suite import run_evaluation
9
 
10
  from src.backend.tasks.xsum.task import XSum
 
 
11
  from src.backend.tasks.cnndm.task import CNNDM
12
- from src.backend.tasks.selfcheckgpt.task import SelfCheckGpt
 
 
13
 
14
- from lm_eval.tasks import initialize_tasks, include_task_folder
15
  from lm_eval import tasks, evaluator, utils
16
 
17
  from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
18
  from src.envs import QUEUE_REPO
19
 
 
 
20
 
21
  def main():
22
  # snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
@@ -29,15 +35,24 @@ def main():
29
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
30
 
31
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
32
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
33
- eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
 
 
 
 
34
 
35
  # my_task = Task("memo-trap", "acc", "memo-trap", 0)
36
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
37
  # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
38
  # my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
39
  # my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
40
- my_task = Task("fever11", "acc", "FEVER", 8)
 
 
 
 
 
41
 
42
  eval_logger = utils.eval_logger
43
  import logging
@@ -47,12 +62,13 @@ def main():
47
  # task_names = ['triviaqa']
48
  # TASKS_HARNESS = [task.value for task in Tasks]
49
 
50
- include_task_folder("src/backend/tasks/")
51
- initialize_tasks('INFO')
 
52
 
53
  # breakpoint()
54
 
55
- print(tasks.ALL_TASKS)
56
 
57
  for task in TASKS_HARNESS:
58
  print(f"Selected Tasks: [{task}]")
@@ -60,7 +76,7 @@ def main():
60
 
61
  # breakpoint()
62
  results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
63
- batch_size=1, device="mps", use_cache=None, limit=1000, write_out=True)
64
  print('AAA', results["results"])
65
 
66
  breakpoint()
 
8
  from src.backend.run_eval_suite import run_evaluation
9
 
10
  from src.backend.tasks.xsum.task import XSum
11
+ from src.backend.tasks.xsum.task_v2 import XSumv2
12
+
13
  from src.backend.tasks.cnndm.task import CNNDM
14
+ from src.backend.tasks.cnndm.task_v2 import CNNDMv2
15
+
16
+ from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
17
 
18
+ from lm_eval.tasks import TaskManager
19
  from lm_eval import tasks, evaluator, utils
20
 
21
  from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
22
  from src.envs import QUEUE_REPO
23
 
24
+ from lm_eval.models.huggingface import HFLM
25
+
26
 
27
  def main():
28
  # snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
35
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
36
 
37
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
38
+ eval_requests: list[EvalRequest] = get_eval_requests(job_status=status,
39
+ hf_repo=QUEUE_REPO,
40
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
41
+ do_download=False)
42
+ # eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
43
+ eval_request = [r for r in eval_requests if 'meta-llama/Llama-2-7b-hf' in r.model][0]
44
 
45
  # my_task = Task("memo-trap", "acc", "memo-trap", 0)
46
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
47
  # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
48
  # my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
49
  # my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
50
+
51
+ # my_task = Task("nq_swap", "exact_match", "NQ-Swap", 2)
52
+ # my_task = Task("memo-trap_v2", "acc", "XXX", 2)
53
+ my_task = Task("xsum_v2", "rougeL", "XXX", 0)
54
+ # my_task = Task("squadv2", "exact", "XXX", 0)
55
+ # my_task = Task("scrolls_qasper", "f1", "XXX", 0)
56
 
57
  eval_logger = utils.eval_logger
58
  import logging
 
62
  # task_names = ['triviaqa']
63
  # TASKS_HARNESS = [task.value for task in Tasks]
64
 
65
+ # include_task_folder("src/backend/tasks/")
66
+ task_manager = TaskManager(include_path="./src/backend/tasks/")
67
+ # task_manager.initialize_tasks(include_path="src/backend/tasks/")
68
 
69
  # breakpoint()
70
 
71
+ print(task_manager.all_tasks)
72
 
73
  for task in TASKS_HARNESS:
74
  print(f"Selected Tasks: [{task}]")
 
76
 
77
  # breakpoint()
78
  results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
79
+ batch_size=1, device="mps", use_cache=None, limit=2, write_out=True, task_manager=task_manager)
80
  print('AAA', results["results"])
81
 
82
  breakpoint()
cli/nqswap-upload-cli.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ from datasets import load_dataset
4
+
5
+ path = 'pminervini/NQ-Swap'
6
+
7
+ ds = load_dataset("json",
8
+ data_files={
9
+ 'original': 'nqswap/original.jsonl',
10
+ 'substituted': 'nqswap/substituted.jsonl'
11
+ })
12
+ ds.push_to_hub(path)
cli/nqswap/original.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
cli/nqswap/substituted.jsonl ADDED
The diff for this file is too large to render. See raw diff