boris committed on
Commit
bf3640d
·
1 Parent(s): 38705a9

refactor: loop over runs

Browse files
Files changed (1) hide show
  1. dev/inference/wandb-backend.ipynb +98 -229
dev/inference/wandb-backend.ipynb CHANGED
@@ -13,6 +13,7 @@
13
  "import random\n",
14
  "import numpy as np\n",
15
  "from PIL import Image\n",
 
16
  "import jax\n",
17
  "import jax.numpy as jnp\n",
18
  "from flax.training.common_utils import shard, shard_prng_key\n",
@@ -47,18 +48,10 @@
47
  "num_images = 128\n",
48
  "top_k = 8\n",
49
  "text_normalizer = TextNormalizer() if normalize_text else None\n",
50
- "padding_item = 'NONE'"
51
- ]
52
- },
53
- {
54
- "cell_type": "code",
55
- "execution_count": null,
56
- "id": "6a045827-3461-4499-8959-38d173bc4e5e",
57
- "metadata": {},
58
- "outputs": [],
59
- "source": [
60
  "seed = random.randint(0, 2**32-1)\n",
61
- "key = jax.random.PRNGKey(seed)"
 
62
  ]
63
  },
64
  {
@@ -70,18 +63,26 @@
70
  "source": [
71
  "vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)\n",
72
  "clip = FlaxCLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
73
- "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")"
 
 
74
  ]
75
  },
76
  {
77
  "cell_type": "code",
78
  "execution_count": null,
79
- "id": "4927529a-8828-4150-bc76-e1b60d8dee62",
80
  "metadata": {},
81
  "outputs": [],
82
  "source": [
83
- "clip_params = replicate(clip.params)\n",
84
- "vqgan_params = replicate(vqgan.params)"
 
 
 
 
 
 
85
  ]
86
  },
87
  {
@@ -103,36 +104,6 @@
103
  " samples = [samples[i:i+batch_size] for i in range(0, len(samples), batch_size)]"
104
  ]
105
  },
106
- {
107
- "cell_type": "code",
108
- "execution_count": null,
109
- "id": "f75b2869-fc25-4f56-b937-e97bbb712ede",
110
- "metadata": {},
111
- "outputs": [],
112
- "source": [
113
- "len(samples)"
114
- ]
115
- },
116
- {
117
- "cell_type": "code",
118
- "execution_count": null,
119
- "id": "c48525c9-447a-4430-81d7-4b699f545638",
120
- "metadata": {},
121
- "outputs": [],
122
- "source": [
123
- "samples[-1]"
124
- ]
125
- },
126
- {
127
- "cell_type": "code",
128
- "execution_count": null,
129
- "id": "a2c629e9-1a82-40c6-a260-ca1780c19a2e",
130
- "metadata": {},
131
- "outputs": [],
132
- "source": [
133
- "api = wandb.Api()"
134
- ]
135
- },
136
  {
137
  "cell_type": "code",
138
  "execution_count": null,
@@ -142,7 +113,7 @@
142
  "source": [
143
  "# TODO: iterate on runs\n",
144
  "wandb_run = wandb_runs[0]\n",
145
- "functions_pmapped = False"
146
  ]
147
  },
148
  {
@@ -152,60 +123,12 @@
152
  "metadata": {},
153
  "outputs": [],
154
  "source": [
155
- "try:\n",
156
- " versions = api.artifact_versions(type_name='bart_model', name=f'dalle-mini/dalle-mini/model-{wandb_run}', per_page=10000)\n",
157
- "except:\n",
158
- " versions = []"
159
- ]
160
- },
161
- {
162
- "cell_type": "code",
163
- "execution_count": null,
164
- "id": "e8026e63-9e73-472c-9440-5e742c614901",
165
- "metadata": {},
166
- "outputs": [],
167
- "source": [
168
- "versions, len(versions)"
169
- ]
170
- },
171
- {
172
- "cell_type": "code",
173
- "execution_count": null,
174
- "id": "ead44aee-52d5-4ca2-8984-c4d267d9e72a",
175
- "metadata": {},
176
- "outputs": [],
177
- "source": [
178
- "versions[0].version"
179
- ]
180
- },
181
- {
182
- "cell_type": "code",
183
- "execution_count": null,
184
- "id": "cfd48de9-6022-444f-8b12-05cba8fad071",
185
- "metadata": {},
186
- "outputs": [],
187
- "source": [
188
- "artifact = versions[0]"
189
- ]
190
- },
191
- {
192
- "cell_type": "code",
193
- "execution_count": null,
194
- "id": "4db848c1-2bb5-432c-a732-1c6d0636e172",
195
- "metadata": {},
196
- "outputs": [],
197
- "source": [
198
- "version = int(artifact.version[1:])"
199
- ]
200
- },
201
- {
202
- "cell_type": "code",
203
- "execution_count": null,
204
- "id": "25fac577-146d-4e62-a3ea-f0baea79ef83",
205
- "metadata": {},
206
- "outputs": [],
207
- "source": [
208
- "version"
209
  ]
210
  },
211
  {
@@ -215,20 +138,10 @@
215
  "metadata": {},
216
  "outputs": [],
217
  "source": [
218
- "# retrieve training run\n",
219
- "training_run = api.run(f'dalle-mini/dalle-mini/{wandb_run}')\n",
220
- "config = training_run.config"
221
- ]
222
- },
223
- {
224
- "cell_type": "code",
225
- "execution_count": null,
226
- "id": "9b9393c6-0a3c-46a8-ba27-ba37982b0009",
227
- "metadata": {},
228
- "outputs": [],
229
- "source": [
230
- "# see summary metrics\n",
231
- "training_run.summary"
232
  ]
233
  },
234
  {
@@ -239,7 +152,7 @@
239
  "outputs": [],
240
  "source": [
241
  "# retrieve inference run details\n",
242
- "def get_last_version_inference(run_id):\n",
243
  " try:\n",
244
  " inference_run = api.run(f'dalle-mini/dalle-mini/inference-{run_id}')\n",
245
  " return inference_run.summary.get('_step', None)\n",
@@ -250,147 +163,103 @@
250
  {
251
  "cell_type": "code",
252
  "execution_count": null,
253
- "id": "93b8d869-1658-4fa4-a401-2b91f8ac7a11",
254
- "metadata": {},
255
- "outputs": [],
256
- "source": [
257
- "last_version_inference = get_last_version_inference(wandb_run)"
258
- ]
259
- },
260
- {
261
- "cell_type": "code",
262
- "execution_count": null,
263
- "id": "8324835e-fd94-408e-b106-138be308480b",
264
- "metadata": {},
265
- "outputs": [],
266
- "source": [
267
- "if last_version_inference is None:\n",
268
- " assert version == 0\n",
269
- "elif last_version_inference >= version:\n",
270
- " print(f'Version {version} has already been logged')\n",
271
- "else:\n",
272
- " assert version == last_version_inference + 1"
273
- ]
274
- },
275
- {
276
- "cell_type": "code",
277
- "execution_count": null,
278
- "id": "8ce9d2d3-aea3-4d5e-834a-c5caf85dd117",
279
- "metadata": {},
280
- "outputs": [],
281
- "source": [
282
- "run = wandb.init(job_type='inference', config=config, id=f'inference-{wandb_run}', resume='allow')"
283
- ]
284
- },
285
- {
286
- "cell_type": "code",
287
- "execution_count": null,
288
- "id": "ffe392c9-36d2-4aaa-a1b3-a827e348c1ef",
289
- "metadata": {},
290
- "outputs": [],
291
- "source": [
292
- "tmp_f.cleanup\n",
293
- "tmp_f = tempfile.TemporaryDirectory()\n",
294
- "tmp = tmp_f.name\n",
295
- "#TODO: use context manager"
296
- ]
297
- },
298
- {
299
- "cell_type": "code",
300
- "execution_count": null,
301
- "id": "562036ed-dc86-48af-90b1-9c18383b3552",
302
- "metadata": {},
303
- "outputs": [],
304
- "source": [
305
- "# remove tmp\n",
306
- "tmp_f.cleanup()"
307
- ]
308
- },
309
- {
310
- "cell_type": "code",
311
- "execution_count": null,
312
- "id": "299db1bb-fbe6-4d79-a48f-89893f8ed809",
313
- "metadata": {},
314
- "outputs": [],
315
- "source": [
316
- "artifact = run.use_artifact(artifact)"
317
- ]
318
- },
319
- {
320
- "cell_type": "code",
321
- "execution_count": null,
322
- "id": "d71481bf-98aa-42cb-b7e2-545d13ae4309",
323
- "metadata": {},
324
- "outputs": [],
325
- "source": [
326
- "# only download required files\n",
327
- "for f in ['config.json', 'flax_model.msgpack', 'merges.txt', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'vocab.json']:\n",
328
- " artifact.get_path(f).download(tmp)"
329
- ]
330
- },
331
- {
332
- "cell_type": "code",
333
- "execution_count": null,
334
- "id": "6f8ad8dd-da8f-40f9-b438-e43b779d637c",
335
  "metadata": {},
336
  "outputs": [],
337
  "source": [
338
- "# we verify all the files are present\n",
339
- "from pathlib import Path\n",
340
- "list(Path(tmp).glob('*'))"
 
 
 
 
 
 
 
 
 
 
 
341
  ]
342
  },
343
  {
344
  "cell_type": "code",
345
  "execution_count": null,
346
- "id": "5b715c32-e757-4cb0-9912-ff90238b9f10",
347
  "metadata": {},
348
  "outputs": [],
349
  "source": [
350
- "tokenizer = BartTokenizer.from_pretrained(tmp)\n",
351
- "model = CustomFlaxBartForConditionalGeneration.from_pretrained(tmp)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  ]
353
  },
354
  {
355
  "cell_type": "code",
356
  "execution_count": null,
357
- "id": "320823c9-124a-4fc3-a12c-8c015a128285",
358
  "metadata": {},
359
  "outputs": [],
360
  "source": [
361
- "model_params = replicate(model.params)"
362
  ]
363
  },
364
  {
365
  "cell_type": "code",
366
  "execution_count": null,
367
- "id": "d1cc9993-1bfc-4ec6-a004-c056189c42ac",
368
  "metadata": {},
369
  "outputs": [],
370
  "source": [
371
- "# function to generate encoded images\n",
372
- "# we should generate this function only once per run\n",
373
- "if not functions_pmapped:\n",
374
- " @partial(jax.pmap, axis_name=\"batch\")\n",
375
- " def p_generate(tokenized_prompt, key, params):\n",
376
- " return model.generate(\n",
377
- " **tokenized_prompt,\n",
378
- " do_sample=True,\n",
379
- " num_beams=1,\n",
380
- " prng_key=key,\n",
381
- " params=params\n",
382
- " )\n",
383
- " \n",
384
- " @partial(jax.pmap, axis_name=\"batch\")\n",
385
- " def p_decode(indices, params):\n",
386
- " return vqgan.decode_code(indices, params=params)\n",
387
- " \n",
388
- " @partial(jax.pmap, axis_name=\"batch\")\n",
389
- " def p_clip(inputs):\n",
390
- " logits = clip(**inputs).logits_per_image\n",
391
- " return logits\n",
392
- " \n",
393
- " functions_pmapped = False"
394
  ]
395
  },
396
  {
 
13
  "import random\n",
14
  "import numpy as np\n",
15
  "from PIL import Image\n",
16
+ "from tqdm import tqdm\n",
17
  "import jax\n",
18
  "import jax.numpy as jnp\n",
19
  "from flax.training.common_utils import shard, shard_prng_key\n",
 
48
  "num_images = 128\n",
49
  "top_k = 8\n",
50
  "text_normalizer = TextNormalizer() if normalize_text else None\n",
51
+ "padding_item = 'NONE'\n",
 
 
 
 
 
 
 
 
 
52
  "seed = random.randint(0, 2**32-1)\n",
53
+ "key = jax.random.PRNGKey(seed)\n",
54
+ "api = wandb.Api()"
55
  ]
56
  },
57
  {
 
63
  "source": [
64
  "vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)\n",
65
  "clip = FlaxCLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
66
+ "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
67
+ "clip_params = replicate(clip.params)\n",
68
+ "vqgan_params = replicate(vqgan.params)"
69
  ]
70
  },
71
  {
72
  "cell_type": "code",
73
  "execution_count": null,
74
+ "id": "a500dd07-dbc3-477d-80d4-2b73a3b83ef3",
75
  "metadata": {},
76
  "outputs": [],
77
  "source": [
78
+ "@partial(jax.pmap, axis_name=\"batch\")\n",
79
+ "def p_decode(indices, params):\n",
80
+ " return vqgan.decode_code(indices, params=params)\n",
81
+ "\n",
82
+ "@partial(jax.pmap, axis_name=\"batch\")\n",
83
+ "def p_clip(inputs):\n",
84
+ " logits = clip(**inputs).logits_per_image\n",
85
+ " return logits"
86
  ]
87
  },
88
  {
 
104
  " samples = [samples[i:i+batch_size] for i in range(0, len(samples), batch_size)]"
105
  ]
106
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  {
108
  "cell_type": "code",
109
  "execution_count": null,
 
113
  "source": [
114
  "# TODO: iterate on runs\n",
115
  "wandb_run = wandb_runs[0]\n",
116
+ "model_pmapped = False"
117
  ]
118
  },
119
  {
 
123
  "metadata": {},
124
  "outputs": [],
125
  "source": [
126
+ "def get_artifact_versions(run_id):\n",
127
+ " try:\n",
128
+ " versions = api.artifact_versions(type_name='bart_model', name=f'dalle-mini/dalle-mini/model-{run_id}', per_page=10000)\n",
129
+ " except:\n",
130
+ " versions = []\n",
131
+ " return versions"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  ]
133
  },
134
  {
 
138
  "metadata": {},
139
  "outputs": [],
140
  "source": [
141
+ "def get_training_config(run_id):\n",
142
+ " training_run = api.run(f'dalle-mini/dalle-mini/{run_id}')\n",
143
+ " config = training_run.config\n",
144
+ " return config"
 
 
 
 
 
 
 
 
 
 
145
  ]
146
  },
147
  {
 
152
  "outputs": [],
153
  "source": [
154
  "# retrieve inference run details\n",
155
+ "def get_last_inference_version(run_id):\n",
156
  " try:\n",
157
  " inference_run = api.run(f'dalle-mini/dalle-mini/inference-{run_id}')\n",
158
  " return inference_run.summary.get('_step', None)\n",
 
163
  {
164
  "cell_type": "code",
165
  "execution_count": null,
166
+ "id": "d1cc9993-1bfc-4ec6-a004-c056189c42ac",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  "metadata": {},
168
  "outputs": [],
169
  "source": [
170
+ "# compile functions - needed only once per run\n",
171
+ "def pmap_model_function(model):\n",
172
+ " \n",
173
+ " @partial(jax.pmap, axis_name=\"batch\")\n",
174
+ " def _generate(tokenized_prompt, key, params):\n",
175
+ " return model.generate(\n",
176
+ " **tokenized_prompt,\n",
177
+ " do_sample=True,\n",
178
+ " num_beams=1,\n",
179
+ " prng_key=key,\n",
180
+ " params=params\n",
181
+ " )\n",
182
+ " \n",
183
+ " return _generate"
184
  ]
185
  },
186
  {
187
  "cell_type": "code",
188
  "execution_count": null,
189
+ "id": "bba70f33-af8b-4eb3-9973-7be672301a0b",
190
  "metadata": {},
191
  "outputs": [],
192
  "source": [
193
+ "def log_run(run_id):\n",
194
+ " artifact_versions = get_artifact_versions(run_id)\n",
195
+ " last_inference_version = get_last_inference_version(run_id)\n",
196
+ " training_config = get_training_config(run_id)\n",
197
+ " run = None\n",
198
+ " p_generate = None\n",
199
+ " model_files = ['config.json', 'flax_model.msgpack', 'merges.txt', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'vocab.json']\n",
200
+ " for artifact in artifact_versions:\n",
201
+ " print(f'Processing artifact: {artifact.name}')\n",
202
+ " version = int(artifact.version[1:])\n",
203
+ " if last_inference_version is None:\n",
204
+ " # we should start from v0\n",
205
+ " assert version == 0\n",
206
+ " elif version <= last_inference_version:\n",
207
+ " print(f'v{version} has already been logged (versions logged up to v{last_inference_version})')\n",
208
+ " else:\n",
209
+ " # check we are logging the correct version\n",
210
+ " assert version == last_inference_version + 1\n",
211
+ " \n",
212
+ " # start/resume corresponding run\n",
213
+ " if run is None:\n",
214
+ " run = wandb.init(job_type='inference', config=training_config, id=f'inference-{run_id}', resume='allow')\n",
215
+ " \n",
216
+ " # work in temporary directory\n",
217
+ " with tempfile.TemporaryDirectory() as tmp:\n",
218
+ " \n",
219
+ " # download model files\n",
220
+ " artifact = run.use_artifact(artifact)\n",
221
+ " for f in model_files:\n",
222
+ " artifact.get_path(f).download(tmp)\n",
223
+ " \n",
224
+ " # load tokenizer and model\n",
225
+ " tokenizer = BartTokenizer.from_pretrained(tmp)\n",
226
+ " model = CustomFlaxBartForConditionalGeneration.from_pretrained(tmp)\n",
227
+ " model_params = replicate(model.params)\n",
228
+ " \n",
229
+ " # pmap model function needs to happen only once per model config\n",
230
+ " if p_generate is None:\n",
231
+ " p_generate = pmap_model_function(model)\n",
232
+ " \n",
233
+ " for batch in tqdm(samples):\n",
234
+ " prompts = [x['Caption'] for x in batch]\n",
235
+ " processed_prompts = [text_normalizer(x) for x in prompts] if normalize_text else prompts\n",
236
+ " \n",
237
+ "\n",
238
+ " \n",
239
+ " \n",
240
+ " "
241
  ]
242
  },
243
  {
244
  "cell_type": "code",
245
  "execution_count": null,
246
+ "id": "4d542342-3232-48a5-a0aa-3cb5c157aa8c",
247
  "metadata": {},
248
  "outputs": [],
249
  "source": [
250
+ "log_run(wandb_run)"
251
  ]
252
  },
253
  {
254
  "cell_type": "code",
255
  "execution_count": null,
256
+ "id": "4e4c7d0c-2848-4f88-b967-82fd571534f1",
257
  "metadata": {},
258
  "outputs": [],
259
  "source": [
260
+ "def log_runs(runs):\n",
261
+ " for run in tqdm(runs):\n",
262
+ " log_run(run)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  ]
264
  },
265
  {