meg-huggingface committed on
Commit
939c209
·
1 Parent(s): b891a6a

Adding Traceback handling

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -2
  2. create_results.py +0 -39
  3. entrypoint.sh +18 -9
  4. process_runs.py +111 -0
Dockerfile CHANGED
@@ -21,7 +21,6 @@ RUN mkdir -p .cache
21
  RUN chmod 777 -R .cache
22
  #RUN chmod 777 -R data
23
 
24
-
25
  RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
26
  build-essential \
27
  ca-certificates \
@@ -61,7 +60,7 @@ COPY ./.cache /.cache
61
  COPY ./entrypoint.sh /entrypoint.sh
62
  COPY ./pause_space.py /pause_space.py
63
  COPY ./parse_requests.py /parse_requests.py
64
- COPY ./create_results.py /create_results.py
65
  COPY ./runs /runs
66
  COPY ./attempts.txt /attempts.txt
67
  COPY ./failed_attempts.txt /failed_attempts.txt
 
21
  RUN chmod 777 -R .cache
22
  #RUN chmod 777 -R data
23
 
 
24
  RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
25
  build-essential \
26
  ca-certificates \
 
60
  COPY ./entrypoint.sh /entrypoint.sh
61
  COPY ./pause_space.py /pause_space.py
62
  COPY ./parse_requests.py /parse_requests.py
63
# create_results.py is deleted by this commit and entrypoint.sh now runs
# /process_runs.py, so the file copied into the image must be process_runs.py.
COPY ./process_runs.py /process_runs.py
64
  COPY ./runs /runs
65
  COPY ./attempts.txt /attempts.txt
66
  COPY ./failed_attempts.txt /failed_attempts.txt
create_results.py DELETED
@@ -1,39 +0,0 @@
1
- import os
2
- import sys
3
- from datasets import load_dataset, Dataset
4
- from huggingface_hub import HfApi
5
- import pandas as pd
6
-
7
- TOKEN = os.environ.get("DEBUG")
8
-
9
- api = HfApi(token=TOKEN)
10
-
11
- out_dir = sys.argv[1]
12
- all_attempts_read = open("attempts.txt", "r+").readlines()
13
- failed_attempts_read = open("failed_attempts.txt", "r+").readlines()
14
-
15
- # Uploading output to the results dataset.
16
- api.upload_folder(
17
- folder_path=out_dir,
18
- repo_id="AIEnergyScore/results_debug",
19
- repo_type="dataset",
20
- )
21
-
22
- # Updating requests
23
- requests = load_dataset("AIEnergyScore/requests_debug", split="test",
24
- token=TOKEN)
25
- requests_dset = requests.to_pandas()
26
-
27
- for line in all_attempts_read:
28
- experiment_name, model = line.strip().split(',')
29
- if line not in failed_attempts_read:
30
- requests_dset.loc[
31
- requests_dset["model"] == model, ['status']] = "COMPLETED"
32
- else:
33
- requests_dset.loc[
34
- requests_dset["model"] == model, ['status']] = "FAILED"
35
-
36
- updated_dset = Dataset.from_pandas(requests_dset)
37
- updated_dset.push_to_hub("AIEnergyScore/requests_debug", split="test",
38
- token=TOKEN)
39
- print("Updated model status")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
entrypoint.sh CHANGED
@@ -1,27 +1,36 @@
1
  #!/bin/bash
2
 
 
3
  export SPACE="AIEnergyScore/launch-computation-example"
4
 
5
- echo "Attempting to run."
 
 
6
 
 
7
  # For each line in the requests dataset....
8
  python /parse_requests.py | while read -r line; do
9
- # Read the name of the model and the experiment.
10
- IFS="," read backend_model experiment_name <<< "${line}"
11
- echo "Benchmarking Model: ${backend_model}, Task: ${experiment_name}"
12
 
13
  # Initialize the directory for output.
14
  now=$(date +%Y-%m-%d-%H-%M-%S)
15
- run_dir="./runs/${experiment_name}/${backend_model}/${now}"
16
  mkdir -p "$run_dir"
17
- echo "${experiment_name},${backend_model}" >> /attempts.txt
 
18
 
19
- # Let the benchmarking begin!
20
- optimum-benchmark --config-name "${experiment_name}" --config-dir /optimum-benchmark/examples/energy_star/ backend.model="${backend_model}" backend.processor="${backend_model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log" || echo "${experiment_name},${backend_model}" >> /failed_attempts.txt
 
 
 
 
21
  done
22
 
23
  echo "Finished; updating requests dataset and results dataset."
24
- python /create_results.py ./runs
25
 
26
  # Pausing space
27
  echo "Pausing space."
 
1
  #!/bin/bash
2
 
3
+ # TODO: Why is this here? Can we delete it?
4
  export SPACE="AIEnergyScore/launch-computation-example"
5
 
6
+ # Can use this for errors too: trap 'echo "An error occurred."' ERR
7
+
8
+ config_dir="/optimum-benchmark/examples/energy_star/"
9
 
10
+ echo "Attempting to run."
11
  # For each line in the requests dataset....
12
  python /parse_requests.py | while read -r line; do
13
+ # Read the name of the model and the experiment (task).
14
+ IFS="," read model task <<< "${line}"
15
+ echo "Benchmarking Model: ${model}, Task: ${task}"
16
 
17
  # Initialize the directory for output.
18
  now=$(date +%Y-%m-%d-%H-%M-%S)
19
+ run_dir="/runs/${task}/${model}/${now}"
20
  mkdir -p "$run_dir"
21
+ # Save the task/model run directory to text file, for tracking purposes.
22
+ echo "${run_dir}" >> /attempts.txt
23
 
24
+ { # try
25
+ # Let the benchmarking begin!
26
+ optimum-benchmark --config-name "${task}" --config-dir="${config_dir}" backend.model="${model}" backend.processor="${model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log"
27
+ } || { # catch
28
+ echo "${run_dir}" >> /failed_attempts.txt
29
+ }
30
  done
31
 
32
  echo "Finished; updating requests dataset and results dataset."
33
+ python /process_runs.py
34
 
35
  # Pausing space
36
  echo "Pausing space."
process_runs.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from datasets import load_dataset, Dataset
4
+ from huggingface_hub import HfApi
5
+
6
+ TOKEN = os.environ.get("DEBUG")
7
+ api = HfApi(token=TOKEN)
8
+
9
+ REQUESTS_DSET = "AIEnergyScore/requests_debug"
10
+ RESULTS_DSET = "AIEnergyScore/results_debug"
11
+ PENDING = 'PENDING'
12
+ COMPLETED = 'COMPLETED'
13
+ FAILED = 'FAILED'
14
+
15
def parse_args(argv=None):
    """Parse the command-line options used when processing benchmark runs.

    :param argv: optional list of argument strings to parse. When None
        (the default), argparse reads the arguments from sys.argv[1:].
    :return: argparse.Namespace with the attributes ``run_dir``,
        ``attempts`` and ``failed_attempts``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run_dir",
        default="/runs",
        type=str,
        required=False,
        help="Path to the run directory.",
    )
    parser.add_argument(
        "--attempts",
        default="/attempts.txt",
        type=str,
        required=False,
        help="File with per-line run attempt directories. Assumes format '/runs/{task}/{model}/{timestamp}'",
    )
    parser.add_argument(
        "--failed_attempts",
        default="/failed_attempts.txt",
        type=str,
        required=False,
        help="File with per-line failed run directories. Assumes format '/runs/{task}/{model}/{timestamp}'",
    )
    return parser.parse_args(argv)
40
+
41
def check_for_traceback(run_dir):
    """Return the Python traceback stored in a run's error.log, if there is one.

    Reads ``{run_dir}/error.log`` and collects everything from the first line
    containing 'Traceback (most recent call last):' through the end of the file.

    :param run_dir: run directory path; surrounding whitespace (such as the
        trailing newline left on lines read from a text file) is ignored.
    :return: the traceback text, or "" when the log contains no traceback or
        the log file does not exist.
    """
    # Entries read with readlines() still end in "\n"; strip so the path resolves.
    run_dir = run_dir.strip()
    log_path = f"{run_dir}/error.log"
    captured = []
    found_error = False
    try:
        # errors="replace": the log holds arbitrary stderr output, so undecodable
        # bytes should not abort the scan.
        with open(log_path, "r", errors="replace") as f:
            for line in f:
                if not found_error and "Traceback (most recent call last):" in line:
                    found_error = True
                if found_error:
                    captured.append(line)
    except FileNotFoundError:
        print(f"Could not find {log_path}")
    return "".join(captured)
62
+
63
def _parse_run_dir(run_dir):
    """Split a '/runs/{task}/{model}/{timestamp}' path into (task, model), or None if malformed."""
    parts = [p for p in run_dir.strip().split("/") if p not in ("", ".")]
    # Expected components: runs root, task, one or more model components, timestamp.
    if len(parts) < 4:
        return None
    # A model id may itself contain "/" (e.g. "org/name"), so everything between
    # the task and the trailing timestamp is treated as the model.
    return parts[1], "/".join(parts[2:-1])


def update_requests(requests, all_attempts, failed_attempts):
    """
    Sets all PENDING requests with the given model & task to 'COMPLETED' or 'FAILED'.

    Each entry of all_attempts / failed_attempts is a run directory of the form
    run_dir="/runs/${task}/${model}/${now}". A run counts as failed when it is
    listed in failed_attempts or when its error.log contains a traceback; in the
    latter case the traceback is stored in the 'error_message' column.

    :param requests: requests Dataset; its 'status', 'model' and 'task' columns are read
    :param all_attempts: lines (run directories) of every attempted task/model/timestamp
    :param failed_attempts: lines (run directories) of the attempts that exited with an error
    :return: a new Dataset built from the updated requests table
    """
    requests_df = requests.to_pandas()
    # Compare stripped paths so trailing newlines cannot cause mismatches.
    failed_dirs = {entry.strip() for entry in failed_attempts}
    for line in all_attempts:
        run_dir = line.strip()
        if not run_dir:
            continue
        parsed = _parse_run_dir(run_dir)
        if parsed is None:
            print(f"Skipping malformed run directory: {run_dir}")
            continue
        task, model = parsed
        traceback_error = check_for_traceback(run_dir)
        if traceback_error != "":
            print("Found a traceback error!")
            print(traceback_error)
        # Compute the row mask once: re-evaluating `status == PENDING` after the
        # status has been changed would match nothing and drop the error message.
        pending = (
            (requests_df["status"] == PENDING)
            & (requests_df["model"] == model)
            & (requests_df["task"] == task)
        )
        if not pending.any():
            continue
        # NOTE(review): runs listed in failed_attempts are marked FAILED here, as
        # the docstring describes; confirm this is the intended policy.
        if run_dir in failed_dirs or traceback_error != "":
            requests_df.loc[pending, "status"] = FAILED
            if traceback_error != "":
                requests_df.loc[pending, "error_message"] = traceback_error
        else:
            requests_df.loc[pending, "status"] = COMPLETED
    updated_dset = Dataset.from_pandas(requests_df)
    return updated_dset
92
+
93
if __name__ == '__main__':
    # Script entry point: upload the run outputs, then update the status of
    # each request according to the recorded attempts.
    args = parse_args()
    # Uploads all run output to the results dataset.
    print(f"Uploading {args.run_dir} to {RESULTS_DSET}")
    api.upload_folder(
        folder_path=args.run_dir,
        repo_id=RESULTS_DSET,
        repo_type="dataset",
    )
    # Update requests dataset based on whether things have failed or not.
    print(f"Examining the run directory for each model & task to determine if it {FAILED} or {COMPLETED}.")
    print(f"Setting the corresponding line in {REQUESTS_DSET} to {FAILED} or {COMPLETED} based on what's in the directory.")
    requests = load_dataset(REQUESTS_DSET, split="test", token=TOKEN)
    # Read-only access is enough, and the context managers close the files.
    with open(args.attempts, "r") as attempts_file:
        all_attempts = attempts_file.readlines()
    with open(args.failed_attempts, "r") as failed_file:
        failed_attempts = failed_file.readlines()
    updated_requests = update_requests(requests, all_attempts, failed_attempts)
    print(f"Uploading updated {REQUESTS_DSET}.")
    updated_requests.push_to_hub(REQUESTS_DSET, split="test", token=TOKEN)
    print("Done.")