Adam
commited on
Commit
Β·
13f157d
1
Parent(s):
4e8b8a4
feat: updated links
Browse files- training.log +91 -91
training.log
CHANGED
@@ -1,125 +1,125 @@
|
|
1 |
-
/home/
|
2 |
warnings.warn(
|
3 |
[2023-04-14 06:58:31,332] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
|
4 |
-
[2023-04-14 06:58:32,784] [INFO] [runner.py:540:main] cmd = /home/
|
5 |
-
/home/
|
6 |
warnings.warn(
|
7 |
[2023-04-14 06:59:25,659] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
|
8 |
[2023-04-14 06:59:25,760] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=8, node_rank=0
|
9 |
[2023-04-14 06:59:25,760] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
|
10 |
[2023-04-14 06:59:25,760] [INFO] [launch.py:247:main] dist_world_size=8
|
11 |
[2023-04-14 06:59:25,760] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
12 |
-
/home/
|
13 |
warnings.warn(
|
14 |
-
/home/
|
15 |
warnings.warn(
|
16 |
-
/home/
|
17 |
warnings.warn(
|
18 |
-
/home/
|
19 |
warnings.warn(
|
20 |
-
/home/
|
21 |
warnings.warn(
|
22 |
-
/home/
|
23 |
warnings.warn(
|
24 |
-
/home/
|
25 |
warnings.warn(
|
26 |
-
/home/
|
27 |
warnings.warn(
|
28 |
[2023-04-14 07:04:01,148] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
29 |
-
Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
30 |
-
|
31 |
0%| | 0/2 [00:00<?, ?it/s]
|
32 |
50%|βββββ | 1/2 [00:00<00:00, 8.62it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
|
|
|
|
33 |
0%| | 0/2 [00:00<?, ?it/s]
|
34 |
50%|βββββ | 1/2 [00:00<00:00, 8.62it/s]Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
35 |
|
36 |
0%| | 0/2 [00:00<?, ?it/s]
|
37 |
|
38 |
-
Found cached dataset parquet (/
|
39 |
|
40 |
0%| | 0/2 [00:00<?, ?it/s]
|
41 |
-
Found cached dataset parquet (/
|
42 |
|
43 |
0%| | 0/2 [00:00<?, ?it/s]
|
44 |
-
Found cached dataset parquet (/
|
45 |
|
46 |
0%| | 0/2 [00:00<?, ?it/s]
|
47 |
-
Found cached dataset parquet (/
|
48 |
|
49 |
0%| | 0/2 [00:00<?, ?it/s]
|
50 |
-
Found cached dataset parquet (/
|
51 |
|
52 |
0%| | 0/2 [00:00<?, ?it/s]
|
53 |
-
Found cached dataset parquet (/
|
54 |
|
55 |
0%| | 0/2 [00:00<?, ?it/s]
|
56 |
-
Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
57 |
-
|
58 |
0%| | 0/2 [00:00<?, ?it/s]
|
59 |
50%|βββββ | 1/2 [00:02<00:02, 2.37s/it]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
|
|
|
|
60 |
0%| | 0/2 [00:00<?, ?it/s]
|
61 |
50%|βββββ | 1/2 [00:02<00:02, 2.37s/it]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
62 |
|
63 |
|
64 |
0%| | 0/2 [00:00<?, ?it/s]
|
65 |
50%|βββββ | 1/2 [00:00<00:00, 1.76it/s]
|
66 |
-
Found cached dataset parquet (/
|
67 |
|
68 |
0%| | 0/2 [00:00<?, ?it/s]
|
69 |
-
Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
70 |
-
|
71 |
0%| | 0/2 [00:00<?, ?it/s]
|
72 |
50%|βββββ | 1/2 [00:01<00:01, 1.92s/it]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
|
|
|
|
73 |
0%| | 0/2 [00:00<?, ?it/s]
|
74 |
50%|βββββ | 1/2 [00:01<00:01, 1.92s/it]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
75 |
|
76 |
|
77 |
0%| | 0/2 [00:00<?, ?it/s]
|
78 |
-
Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
79 |
-
|
80 |
0%| | 0/2 [00:00<?, ?it/s]
|
81 |
50%|βββββ | 1/2 [00:00<00:00, 1.60it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
|
|
|
|
82 |
0%| | 0/2 [00:00<?, ?it/s]
|
83 |
50%|βββββ | 1/2 [00:00<00:00, 1.60it/s]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
84 |
|
85 |
-
Found cached dataset parquet (/
|
86 |
|
87 |
0%| | 0/2 [00:00<?, ?it/s]
|
88 |
|
89 |
0%| | 0/2 [00:00<?, ?it/s]
|
90 |
-
Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
91 |
-
|
92 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
|
|
|
|
93 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
94 |
|
95 |
-
|
96 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
|
|
97 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
98 |
|
99 |
|
100 |
0%| | 0/1 [00:00<?, ?it/s]
|
101 |
-
Found cached dataset parquet (/
|
102 |
-
Found cached dataset parquet (/
|
103 |
|
104 |
0%| | 0/1 [00:00<?, ?it/s]
|
105 |
|
106 |
0%| | 0/1 [00:00<?, ?it/s]
|
107 |
-
Found cached dataset parquet (/
|
108 |
|
109 |
0%| | 0/1 [00:00<?, ?it/s]
|
110 |
-
Found cached dataset parquet (/
|
111 |
|
112 |
0%| | 0/1 [00:00<?, ?it/s]
|
113 |
-
Found cached dataset parquet (/
|
114 |
|
115 |
0%| | 0/1 [00:00<?, ?it/s]
|
116 |
-
Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
117 |
-
|
118 |
0%| | 0/2 [00:00<?, ?it/s]
|
119 |
50%|βββββ | 1/2 [00:00<00:00, 5.87it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
|
|
|
|
120 |
0%| | 0/2 [00:00<?, ?it/s]
|
121 |
50%|βββββ | 1/2 [00:00<00:00, 5.87it/s]Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
122 |
|
123 |
0%| | 0/2 [00:00<?, ?it/s]
|
124 |
|
125 |
-
Found cached dataset parquet (/
|
126 |
|
127 |
0%| | 0/2 [00:00<?, ?it/s]
|
128 |
-
Found cached dataset parquet (/
|
129 |
|
130 |
0%| | 0/2 [00:00<?, ?it/s]
|
131 |
-
Found cached dataset parquet (/
|
132 |
|
133 |
0%| | 0/2 [00:00<?, ?it/s]
|
134 |
-
Found cached dataset parquet (/
|
135 |
|
136 |
0%| | 0/2 [00:00<?, ?it/s]
|
137 |
-
Found cached dataset parquet (/
|
138 |
|
139 |
0%| | 0/2 [00:00<?, ?it/s]
|
140 |
50%|βββββ | 1/2 [00:00<00:00, 3.00it/s]
|
141 |
-
Found cached dataset parquet (/
|
142 |
|
143 |
0%| | 0/2 [00:00<?, ?it/s]
|
144 |
-
Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
145 |
-
|
146 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
|
|
|
|
147 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
148 |
|
149 |
0%| | 0/1 [00:00<?, ?it/s]
|
150 |
|
151 |
-
Found cached dataset webgpt_comparisons (/
|
152 |
|
153 |
0%| | 0/1 [00:00<?, ?it/s]
|
154 |
-
Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
155 |
-
|
156 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
|
|
|
|
157 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
158 |
|
159 |
|
160 |
0%| | 0/1 [00:00<?, ?it/s]
|
161 |
-
Found cached dataset webgpt_comparisons (/
|
162 |
|
163 |
0%| | 0/1 [00:00<?, ?it/s]
|
164 |
-
Found cached dataset webgpt_comparisons (/
|
165 |
|
166 |
0%| | 0/1 [00:00<?, ?it/s]
|
167 |
-
Found cached dataset webgpt_comparisons (/
|
168 |
|
169 |
0%| | 0/1 [00:00<?, ?it/s]
|
170 |
-
Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
171 |
-
|
172 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
173 |
-
|
174 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
175 |
-
Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
176 |
-
|
177 |
0%| | 0/3 [00:00<?, ?it/s]
|
178 |
33%|ββββ | 1/3 [00:11<00:22, 11.28s/it]
|
179 |
33%|ββββ | 1/3 [00:08<00:17, 8.75s/it]
|
180 |
33%|ββββ | 1/3 [00:01<00:02, 1.24s/it]
|
181 |
0%| | 0/3 [00:00<?, ?it/s]
|
182 |
67%|βββββββ | 2/3 [00:02<00:01, 1.03s/it]
|
183 |
67%|βββββββ | 2/3 [00:09<00:04, 4.31s/it]
|
184 |
67%|βββββββ | 2/3 [00:12<00:05, 5.35s/it]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
185 |
-
|
186 |
33%|ββββ | 1/3 [00:03<00:07, 3.78s/it]
|
187 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
|
|
|
|
188 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
|
|
189 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
|
|
|
|
190 |
0%| | 0/3 [00:00<?, ?it/s]
|
191 |
33%|ββββ | 1/3 [00:11<00:22, 11.28s/it]
|
192 |
33%|ββββ | 1/3 [00:08<00:17, 8.75s/it]
|
193 |
33%|ββββ | 1/3 [00:01<00:02, 1.24s/it]
|
194 |
0%| | 0/3 [00:00<?, ?it/s]
|
195 |
67%|βββββββ | 2/3 [00:02<00:01, 1.03s/it]
|
196 |
67%|βββββββ | 2/3 [00:09<00:04, 4.31s/it]
|
197 |
67%|βββββββ | 2/3 [00:12<00:05, 5.35s/it]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
|
|
198 |
33%|ββββ | 1/3 [00:03<00:07, 3.78s/it]
|
199 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
200 |
|
201 |
33%|ββββ | 1/3 [00:00<00:00, 5.25it/s]
|
202 |
|
203 |
|
204 |
|
205 |
|
206 |
|
207 |
0%| | 0/3 [00:00<?, ?it/s]
|
208 |
-
Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
209 |
-
|
210 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
|
|
|
|
211 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
212 |
|
213 |
0%| | 0/3 [00:00<?, ?it/s]
|
214 |
33%|ββββ | 1/3 [00:05<00:11, 5.84s/it]
|
215 |
33%|ββββ | 1/3 [00:03<00:06, 3.03s/it]
|
216 |
67%|βββββββ | 2/3 [00:06<00:02, 2.57s/it]
|
217 |
67%|βββββββ | 2/3 [00:03<00:01, 1.42s/it]
|
218 |
|
219 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
@@ -179,19 +179,19 @@ To disable this warning, you can either:
|
|
179 |
- Avoid using `tokenizers` before the fork if possible
|
180 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
181 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
182 |
-
Using /home/
|
183 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
184 |
-
Using /home/
|
185 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
186 |
-
Using /home/
|
187 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
188 |
-
Using /home/
|
189 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
190 |
-
Using /home/
|
191 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
192 |
-
Using /home/
|
193 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
194 |
-
Using /home/
|
195 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
196 |
To disable this warning, you can either:
|
197 |
- Avoid using `tokenizers` before the fork if possible
|
@@ -205,16 +205,16 @@ To disable this warning, you can either:
|
|
205 |
- Avoid using `tokenizers` before the fork if possible
|
206 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
207 |
Detected CUDA files, patching ldflags
|
208 |
-
Emitting ninja build file /home/
|
209 |
Building extension module fused_adam...
|
210 |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
211 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
212 |
To disable this warning, you can either:
|
213 |
- Avoid using `tokenizers` before the fork if possible
|
214 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
215 |
-
[1/3] /usr/local/cuda-11.4/bin/nvcc -ccbin /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/
|
216 |
-
[2/3] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/
|
217 |
-
[3/3] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/home/
|
218 |
Loading extension module fused_adam...
|
219 |
Time to load fused_adam op: 37.18038511276245 seconds
|
220 |
Loading extension module fused_adam...
|
@@ -238,7 +238,7 @@ To disable this warning, you can either:
|
|
238 |
- Avoid using `tokenizers` before the fork if possible
|
239 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
240 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
241 |
-
Using /home/
|
242 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
243 |
To disable this warning, you can either:
|
244 |
- Avoid using `tokenizers` before the fork if possible
|
@@ -252,7 +252,7 @@ To disable this warning, you can either:
|
|
252 |
- Avoid using `tokenizers` before the fork if possible
|
253 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
254 |
Detected CUDA files, patching ldflags
|
255 |
-
Emitting ninja build file /home/
|
256 |
Building extension module fused_adam...
|
257 |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
258 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
@@ -274,14 +274,14 @@ Time to load fused_adam op: 6.415344715118408 seconds
|
|
274 |
[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:134:__init__] Allgather bucket size 500,000,000
|
275 |
[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:135:__init__] CPU Offload: False
|
276 |
[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:136:__init__] Round robin gradient partitioning: False
|
277 |
-
Using /home/
|
278 |
-
Using /home/
|
279 |
-
Using /home/
|
280 |
-
Using /home/
|
281 |
-
Using /home/
|
282 |
-
Using /home/
|
283 |
-
Using /home/
|
284 |
-
Using /home/
|
285 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
286 |
To disable this warning, you can either:
|
287 |
- Avoid using `tokenizers` before the fork if possible
|
@@ -294,15 +294,15 @@ huggingface/tokenizers: The current process just got forked, after parallelism h
|
|
294 |
To disable this warning, you can either:
|
295 |
- Avoid using `tokenizers` before the fork if possible
|
296 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
297 |
-
Emitting ninja build file /home/
|
298 |
Building extension module utils...
|
299 |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
300 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
301 |
To disable this warning, you can either:
|
302 |
- Avoid using `tokenizers` before the fork if possible
|
303 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
304 |
-
[1/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/
|
305 |
-
[2/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx flatten_unflatten.o -shared -L/home/
|
306 |
Loading extension module utils...
|
307 |
Time to load utils op: 21.48611044883728 seconds
|
308 |
Loading extension module utils...
|
@@ -327,22 +327,22 @@ Rank: 7 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
|
|
327 |
Rank: 4 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
|
328 |
Rank: 6 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
|
329 |
Rank: 2 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
|
330 |
-
Using /home/
|
331 |
-
Using /home/
|
332 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
333 |
Loading extension module utils...
|
334 |
Time to load utils op: 0.0016155242919921875 seconds
|
335 |
-
Using /home/
|
336 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
337 |
Loading extension module utils...
|
338 |
Time to load utils op: 0.0008933544158935547 seconds
|
339 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
340 |
Loading extension module utils...
|
341 |
Time to load utils op: 0.0008301734924316406 seconds
|
342 |
-
Using /home/
|
343 |
-
Using /home/
|
344 |
-
Using /home/
|
345 |
-
Using /home/
|
346 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
347 |
Loading extension module utils...
|
348 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
@@ -514,7 +514,7 @@ Time to load utils op: 0.0009191036224365234 seconds
|
|
514 |
"tp_gather_partition_size": 8
|
515 |
}
|
516 |
}
|
517 |
-
Using /home/
|
518 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
519 |
Loading extension module utils...
|
520 |
Time to load utils op: 0.0014319419860839844 seconds
|
|
|
1 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
2 |
warnings.warn(
|
3 |
[2023-04-14 06:58:31,332] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
|
4 |
+
[2023-04-14 06:58:32,784] [INFO] [runner.py:540:main] cmd = /home/AdamG012/.conda/envs/py39/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets openai/webgpt_comparisons stanfordnlp/SHP --data_split 2,4,4 --model_name_or_path facebook/opt-1.3b --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0.1 --num_train_epochs 2 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 2 --deepspeed --output_dir /lus/chatgpt/hf_runs/DeepSpeedExamples/applications/DeepSpeed-Chat/output/actor-models/1.3b
|
5 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
6 |
warnings.warn(
|
7 |
[2023-04-14 06:59:25,659] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
|
8 |
[2023-04-14 06:59:25,760] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=8, node_rank=0
|
9 |
[2023-04-14 06:59:25,760] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
|
10 |
[2023-04-14 06:59:25,760] [INFO] [launch.py:247:main] dist_world_size=8
|
11 |
[2023-04-14 06:59:25,760] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
12 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
13 |
warnings.warn(
|
14 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
15 |
warnings.warn(
|
16 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
17 |
warnings.warn(
|
18 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
19 |
warnings.warn(
|
20 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
21 |
warnings.warn(
|
22 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
23 |
warnings.warn(
|
24 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
25 |
warnings.warn(
|
26 |
+
/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
|
27 |
warnings.warn(
|
28 |
[2023-04-14 07:04:01,148] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
|
|
|
|
29 |
0%| | 0/2 [00:00<?, ?it/s]
|
30 |
50%|βββββ | 1/2 [00:00<00:00, 8.62it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
31 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
32 |
+
|
33 |
0%| | 0/2 [00:00<?, ?it/s]
|
34 |
50%|βββββ | 1/2 [00:00<00:00, 8.62it/s]Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
35 |
|
36 |
0%| | 0/2 [00:00<?, ?it/s]
|
37 |
|
38 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
39 |
|
40 |
0%| | 0/2 [00:00<?, ?it/s]
|
41 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
42 |
|
43 |
0%| | 0/2 [00:00<?, ?it/s]
|
44 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
45 |
|
46 |
0%| | 0/2 [00:00<?, ?it/s]
|
47 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
48 |
|
49 |
0%| | 0/2 [00:00<?, ?it/s]
|
50 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
51 |
|
52 |
0%| | 0/2 [00:00<?, ?it/s]
|
53 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
54 |
|
55 |
0%| | 0/2 [00:00<?, ?it/s]
|
|
|
|
|
56 |
0%| | 0/2 [00:00<?, ?it/s]
|
57 |
50%|βββββ | 1/2 [00:02<00:02, 2.37s/it]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
58 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
59 |
+
|
60 |
0%| | 0/2 [00:00<?, ?it/s]
|
61 |
50%|βββββ | 1/2 [00:02<00:02, 2.37s/it]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
62 |
|
63 |
|
64 |
0%| | 0/2 [00:00<?, ?it/s]
|
65 |
50%|βββββ | 1/2 [00:00<00:00, 1.76it/s]
|
66 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
67 |
|
68 |
0%| | 0/2 [00:00<?, ?it/s]
|
|
|
|
|
69 |
0%| | 0/2 [00:00<?, ?it/s]
|
70 |
50%|βββββ | 1/2 [00:01<00:01, 1.92s/it]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
71 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
72 |
+
|
73 |
0%| | 0/2 [00:00<?, ?it/s]
|
74 |
50%|βββββ | 1/2 [00:01<00:01, 1.92s/it]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
75 |
|
76 |
|
77 |
0%| | 0/2 [00:00<?, ?it/s]
|
|
|
|
|
78 |
0%| | 0/2 [00:00<?, ?it/s]
|
79 |
50%|βββββ | 1/2 [00:00<00:00, 1.60it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
80 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
81 |
+
|
82 |
0%| | 0/2 [00:00<?, ?it/s]
|
83 |
50%|βββββ | 1/2 [00:00<00:00, 1.60it/s]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
84 |
|
85 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
86 |
|
87 |
0%| | 0/2 [00:00<?, ?it/s]
|
88 |
|
89 |
0%| | 0/2 [00:00<?, ?it/s]
|
|
|
|
|
90 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
91 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
92 |
+
|
93 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
94 |
|
|
|
95 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
96 |
+
|
97 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
98 |
|
99 |
|
100 |
0%| | 0/1 [00:00<?, ?it/s]
|
101 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
102 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
103 |
|
104 |
0%| | 0/1 [00:00<?, ?it/s]
|
105 |
|
106 |
0%| | 0/1 [00:00<?, ?it/s]
|
107 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
108 |
|
109 |
0%| | 0/1 [00:00<?, ?it/s]
|
110 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
111 |
|
112 |
0%| | 0/1 [00:00<?, ?it/s]
|
113 |
+
Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
114 |
|
115 |
0%| | 0/1 [00:00<?, ?it/s]
|
|
|
|
|
116 |
0%| | 0/2 [00:00<?, ?it/s]
|
117 |
50%|βββββ | 1/2 [00:00<00:00, 5.87it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
118 |
+
Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
119 |
+
|
120 |
0%| | 0/2 [00:00<?, ?it/s]
|
121 |
50%|βββββ | 1/2 [00:00<00:00, 5.87it/s]Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
122 |
|
123 |
0%| | 0/2 [00:00<?, ?it/s]
|
124 |
|
125 |
+
Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
126 |
|
127 |
0%| | 0/2 [00:00<?, ?it/s]
|
128 |
+
Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
129 |
|
130 |
0%| | 0/2 [00:00<?, ?it/s]
|
131 |
+
Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
132 |
|
133 |
0%| | 0/2 [00:00<?, ?it/s]
|
134 |
+
Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
135 |
|
136 |
0%| | 0/2 [00:00<?, ?it/s]
|
137 |
+
Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
138 |
|
139 |
0%| | 0/2 [00:00<?, ?it/s]
|
140 |
50%|βββββ | 1/2 [00:00<00:00, 3.00it/s]
|
141 |
+
Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
|
142 |
|
143 |
0%| | 0/2 [00:00<?, ?it/s]
|
|
|
|
|
144 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
145 |
+
Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
146 |
+
|
147 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
148 |
|
149 |
0%| | 0/1 [00:00<?, ?it/s]
|
150 |
|
151 |
+
Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
152 |
|
153 |
0%| | 0/1 [00:00<?, ?it/s]
|
|
|
|
|
154 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
155 |
+
Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
156 |
+
|
157 |
0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
158 |
|
159 |
|
160 |
0%| | 0/1 [00:00<?, ?it/s]
|
161 |
+
Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
162 |
|
163 |
0%| | 0/1 [00:00<?, ?it/s]
|
164 |
+
Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
165 |
|
166 |
0%| | 0/1 [00:00<?, ?it/s]
|
167 |
+
Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
|
168 |
|
169 |
0%| | 0/1 [00:00<?, ?it/s]
|
|
|
|
|
170 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
|
|
171 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
|
|
|
|
172 |
0%| | 0/3 [00:00<?, ?it/s]
|
173 |
33%|ββββ | 1/3 [00:11<00:22, 11.28s/it]
|
174 |
33%|ββββ | 1/3 [00:08<00:17, 8.75s/it]
|
175 |
33%|ββββ | 1/3 [00:01<00:02, 1.24s/it]
|
176 |
0%| | 0/3 [00:00<?, ?it/s]
|
177 |
67%|βββββββ | 2/3 [00:02<00:01, 1.03s/it]
|
178 |
67%|βββββββ | 2/3 [00:09<00:04, 4.31s/it]
|
179 |
67%|βββββββ | 2/3 [00:12<00:05, 5.35s/it]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
|
|
180 |
33%|ββββ | 1/3 [00:03<00:07, 3.78s/it]
|
181 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
182 |
+
Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
183 |
+
|
184 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
185 |
+
|
186 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
187 |
+
Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
188 |
+
|
189 |
0%| | 0/3 [00:00<?, ?it/s]
|
190 |
33%|ββββ | 1/3 [00:11<00:22, 11.28s/it]
|
191 |
33%|ββββ | 1/3 [00:08<00:17, 8.75s/it]
|
192 |
33%|ββββ | 1/3 [00:01<00:02, 1.24s/it]
|
193 |
0%| | 0/3 [00:00<?, ?it/s]
|
194 |
67%|βββββββ | 2/3 [00:02<00:01, 1.03s/it]
|
195 |
67%|βββββββ | 2/3 [00:09<00:04, 4.31s/it]
|
196 |
67%|βββββββ | 2/3 [00:12<00:05, 5.35s/it]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
197 |
+
|
198 |
33%|ββββ | 1/3 [00:03<00:07, 3.78s/it]
|
199 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
200 |
|
201 |
33%|ββββ | 1/3 [00:00<00:00, 5.25it/s]
|
202 |
|
203 |
|
204 |
|
205 |
|
206 |
|
207 |
0%| | 0/3 [00:00<?, ?it/s]
|
|
|
|
|
208 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
209 |
+
Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
210 |
+
|
211 |
0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
|
212 |
|
213 |
0%| | 0/3 [00:00<?, ?it/s]
|
214 |
33%|ββββ | 1/3 [00:05<00:11, 5.84s/it]
|
215 |
33%|ββββ | 1/3 [00:03<00:06, 3.03s/it]
|
216 |
67%|βββββββ | 2/3 [00:06<00:02, 2.57s/it]
|
217 |
67%|βββββββ | 2/3 [00:03<00:01, 1.42s/it]
|
218 |
|
219 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
|
|
179 |
- Avoid using `tokenizers` before the fork if possible
|
180 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
181 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
182 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
183 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
184 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
185 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
186 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
187 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
188 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
189 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
190 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
191 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
192 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
193 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
194 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
195 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
196 |
To disable this warning, you can either:
|
197 |
- Avoid using `tokenizers` before the fork if possible
|
|
|
205 |
- Avoid using `tokenizers` before the fork if possible
|
206 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
207 |
Detected CUDA files, patching ldflags
|
208 |
+
Emitting ninja build file /home/AdamG012/.cache/torch_extensions/py39_cu113/fused_adam/build.ninja...
|
209 |
Building extension module fused_adam...
|
210 |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
211 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
212 |
To disable this warning, you can either:
|
213 |
- Avoid using `tokenizers` before the fork if possible
|
214 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
215 |
+
[1/3] /usr/local/cuda-11.4/bin/nvcc -ccbin /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda-11.4/include -isystem /home/AdamG012/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
|
216 |
+
[2/3] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda-11.4/include -isystem /home/AdamG012/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o
|
217 |
+
[3/3] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda-11.4/lib64 -lcudart -o fused_adam.so
|
218 |
Loading extension module fused_adam...
|
219 |
Time to load fused_adam op: 37.18038511276245 seconds
|
220 |
Loading extension module fused_adam...
|
|
|
238 |
- Avoid using `tokenizers` before the fork if possible
|
239 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
240 |
Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
|
241 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
242 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
243 |
To disable this warning, you can either:
|
244 |
- Avoid using `tokenizers` before the fork if possible
|
|
|
252 |
- Avoid using `tokenizers` before the fork if possible
|
253 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
254 |
Detected CUDA files, patching ldflags
|
255 |
+
Emitting ninja build file /home/AdamG012/.cache/torch_extensions/py39_cu113/fused_adam/build.ninja...
|
256 |
Building extension module fused_adam...
|
257 |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
258 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
|
|
274 |
[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:134:__init__] Allgather bucket size 500,000,000
|
275 |
[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:135:__init__] CPU Offload: False
|
276 |
[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:136:__init__] Round robin gradient partitioning: False
|
277 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
278 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
279 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
280 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
281 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
282 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
283 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
284 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
285 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
286 |
To disable this warning, you can either:
|
287 |
- Avoid using `tokenizers` before the fork if possible
|
|
|
294 |
To disable this warning, you can either:
|
295 |
- Avoid using `tokenizers` before the fork if possible
|
296 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
297 |
+
Emitting ninja build file /home/AdamG012/.cache/torch_extensions/py39_cu113/utils/build.ninja...
|
298 |
Building extension module utils...
|
299 |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
300 |
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
301 |
To disable this warning, you can either:
|
302 |
- Avoid using `tokenizers` before the fork if possible
|
303 |
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
304 |
+
[1/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /home/AdamG012/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -c /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp -o flatten_unflatten.o
|
305 |
+
[2/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx flatten_unflatten.o -shared -L/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o utils.so
|
306 |
Loading extension module utils...
|
307 |
Time to load utils op: 21.48611044883728 seconds
|
308 |
Loading extension module utils...
|
|
|
327 |
Rank: 4 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
|
328 |
Rank: 6 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
|
329 |
Rank: 2 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
|
330 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
331 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
332 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
333 |
Loading extension module utils...
|
334 |
Time to load utils op: 0.0016155242919921875 seconds
|
335 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
336 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
337 |
Loading extension module utils...
|
338 |
Time to load utils op: 0.0008933544158935547 seconds
|
339 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
340 |
Loading extension module utils...
|
341 |
Time to load utils op: 0.0008301734924316406 seconds
|
342 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
343 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
344 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
345 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
346 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
347 |
Loading extension module utils...
|
348 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
|
|
514 |
"tp_gather_partition_size": 8
|
515 |
}
|
516 |
}
|
517 |
+
Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
|
518 |
No modifications detected for re-loaded extension module utils, skipping build step...
|
519 |
Loading extension module utils...
|
520 |
Time to load utils op: 0.0014319419860839844 seconds
|