nbroad HF staff commited on
Commit
d2a60ad
·
1 Parent(s): f7ff38b

Upload 2 files

Browse files
Files changed (2) hide show
  1. data.py +227 -0
  2. infer.py +408 -0
data.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import multiprocessing
4
+ from pathlib import Path
5
+ from typing import Dict, List
6
+
7
+ from datasets import load_dataset, Dataset
8
+ from transformers import AutoTokenizer
9
+
10
+
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
+
13
+
14
+ DATASET_NAME_PATTERN = re.compile(r"[^a-zA-Z0-9]")
15
+
16
+
17
+ def download_dataset(
18
+ ds_name: str,
19
+ ds_config: str = None,
20
+ ds_split: str = "train",
21
+ ):
22
+ """
23
+ Download a dataset from the HuggingFace Hub. Will only save the
24
+
25
+ Args:
26
+ ds_name (`str`):
27
+ The name of the dataset to load.
28
+ ds_config (`str`, *optional*, Defaults to `None`):
29
+ The configuration of the dataset to load.
30
+ ds_split (`str`, *optional*, Defaults to `"train"`):
31
+ The split of the dataset to load.
32
+
33
+ Returns:
34
+ len(ds) (`int`):
35
+ The number of rows in the dataset.
36
+ """
37
+ if ds_name == "wikipedia":
38
+ ds = load_wikipedia(ds_name, ds_config)
39
+ else:
40
+ if ds_config == "":
41
+ ds_config = None
42
+ ds = load_dataset(ds_name, ds_config, split=ds_split)
43
+
44
+ chunk_and_save_dataset(
45
+ ds, ds_name=ds_name, ds_config=ds_config, suffix=f"_{ds_split}_raw"
46
+ )
47
+
48
+ return len(ds)
49
+
50
+
51
+ def load_wikipedia(ds_name, ds_config):
52
+ """
53
+ Stream the wikipedia dataset from the HuggingFace Hub.
54
+
55
+ Args:
56
+ ds_name (`str`):
57
+ The name of the dataset to load. Must be `"wikipedia"`.
58
+ ds_config (`str`, *optional*, Defaults to `None`):
59
+ The configuration of the dataset to load.
60
+
61
+ Returns:
62
+ ds (`datasets.Dataset`):
63
+ """
64
+ ds = load_dataset(ds_name, ds_config, streaming=True, split="train")
65
+
66
+ def gen():
67
+ for example in ds:
68
+ yield {"text": example["text"]}
69
+
70
+ return Dataset.from_generator(gen)
71
+
72
+
73
+ def chunk_and_save_dataset(
74
+ ds: Dataset,
75
+ chunk_size: int = 20_000,
76
+ ds_name: str = None,
77
+ ds_config: str = None,
78
+ suffix: str = "",
79
+ ):
80
+ """
81
+ Chunk a dataset into smaller datasets of size `chunk_size`.
82
+ The name of the dataset will be used to create a folder in `/data`.
83
+
84
+ Args:
85
+ ds (`Dataset`):
86
+ The dataset to chunk.
87
+ chunk_size (`int`, *optional*, Defaults to `20_000`):
88
+ The size of each chunk. Defaults to `20_000`.
89
+ ds_name (`str`, *optional*, Defaults to `None`):
90
+ The name of the dataset to load.
91
+ ds_config (`str`, *optional*, Defaults to `None`):
92
+ The configuration of the dataset to load.
93
+ suffix (`str`, *optional*, Defaults to `""`):
94
+ The suffix to add to the dataset name.
95
+
96
+
97
+ Returns:
98
+ chunks (`List[Dataset]`):
99
+ The list of chunks.
100
+ """
101
+
102
+ if ds_config is None:
103
+ ds_config = ""
104
+
105
+ folder = Path("/data") / DATASET_NAME_PATTERN.sub("", ds_name + ds_config)
106
+ folder.mkdir(exist_ok=True, parents=True)
107
+
108
+ for chunk_num, start_idx in enumerate(range(0, len(ds), chunk_size)):
109
+ end_idx = min(start_idx + chunk_size, len(ds))
110
+
111
+ temp = ds.select(range(start_idx, end_idx))
112
+
113
+ temp.to_parquet(str(folder / f"chunk_{chunk_num}{suffix}"))
114
+
115
+
116
+ def tokenize_dataset(
117
+ ds_name: str,
118
+ ds_config: str = None,
119
+ ds_split: str = "train",
120
+ model_name: str = None,
121
+ opt_level: str = None,
122
+ column_name: str = "text",
123
+ num2skip: int = 0,
124
+ num2embed: int = -1,
125
+ ):
126
+ """
127
+ Tokenize the examples using the tokenizer. Sort by length
128
+
129
+ Args:
130
+ ds_name (`str`):
131
+ The name of the dataset to load.
132
+
133
+ ds_config (`str`, *optional*, Defaults to `None`):
134
+ The configuration of the dataset to load.
135
+
136
+ model_name (`str`, *optional*, Defaults to `None`):
137
+ The name of the model to use for tokenization.
138
+
139
+ opt_level (`str`, *optional*, Defaults to `None`):
140
+ The optimization level to use for tokenization.
141
+
142
+ column_name (`str`, *optional*, defaults to `text`):
143
+ column name to use for tokenization. Defaults to `text`
144
+
145
+ num2skip (`int`, *optional*, defaults to `0`):
146
+ number of rows to skip. Defaults to `0`
147
+
148
+ num2embed (`int`, *optional*, defaults to `-1`):
149
+ number of rows to embed. Defaults to `-1`, which means all rows.
150
+
151
+ Returns:
152
+ ds (`Dataset`):
153
+ """
154
+
155
+ # TODO: option for controlling length for models that can go shorter/longer than 512
156
+
157
+ folder = Path("/data") / DATASET_NAME_PATTERN.sub("", ds_name + ds_config)
158
+ files = list(map(str, folder.glob(f"chunk_*_{ds_split}_raw")))
159
+
160
+ ds = load_dataset("parquet", data_files=files, split="train")
161
+
162
+ if num2embed == -1:
163
+ num2embed = len(ds)
164
+ ds = ds.select(range(num2skip, num2skip + num2embed))
165
+
166
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
167
+
168
+ padding = "max_length" if opt_level == "O4" else False
169
+ max_length = 512
170
+
171
+ def tokenize(
172
+ examples: Dict[str, List[str]],
173
+ ):
174
+ tokenized = tokenizer(
175
+ examples[column_name],
176
+ truncation=True,
177
+ padding=padding,
178
+ max_length=max_length,
179
+ )
180
+ tokenized["length"] = [len(x) for x in tokenized["input_ids"]]
181
+
182
+ return tokenized
183
+
184
+ tds = ds.map(
185
+ tokenize,
186
+ batched=True,
187
+ batch_size=1000,
188
+ remove_columns=set(ds.column_names) - {column_name},
189
+ num_proc=multiprocessing.cpu_count(),
190
+ desc="Tokenizing",
191
+ )
192
+
193
+ # sort to minimize padding
194
+ if padding != "max_length":
195
+ tds = tds.sort("length")
196
+
197
+ chunk_and_save_dataset(
198
+ tds, ds_name=ds_name, ds_config=ds_config, suffix=f"_{ds_split}_tokenized"
199
+ )
200
+
201
+
202
+ def load_tokenized_dataset(
203
+ ds_name: str,
204
+ ds_config: str = None,
205
+ ds_split: str = "train",
206
+ ):
207
+ """
208
+ Load a tokenized dataset from disk.
209
+
210
+ Args:
211
+ ds_name (`str`):
212
+ The name of the dataset to load.
213
+
214
+ ds_config (`str`, *optional*, Defaults to `None`):
215
+ The configuration of the dataset to load.
216
+
217
+ ds_split (`str`, *optional*, Defaults to `"train"`):
218
+ The split of the dataset to load.
219
+
220
+ Returns:
221
+ ds (`Dataset`):
222
+ """
223
+
224
+ folder = Path("/data") / DATASET_NAME_PATTERN.sub("", ds_name + ds_config)
225
+ files = list(map(str, folder.glob(f"chunk_*_{ds_split}_tokenized")))
226
+
227
+ return load_dataset("parquet", data_files=files, split="train")
infer.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import shutil
4
+ from pathlib import Path
5
+ from functools import partial
6
+ from typing import Union, Dict, List
7
+
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ import datasets
11
+ from datasets import load_dataset, Dataset
12
+ from transformers import AutoTokenizer, PreTrainedTokenizer, DataCollatorWithPadding
13
+ from huggingface_hub import Repository, create_repo, HfApi
14
+ from optimum.onnxruntime import (
15
+ AutoOptimizationConfig,
16
+ ORTModelForFeatureExtraction,
17
+ ORTOptimizer,
18
+ )
19
+
20
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
21
+
22
+ opt_configs = {
23
+ "O2": AutoOptimizationConfig.O2(),
24
+ "O3": AutoOptimizationConfig.O3(),
25
+ "O4": AutoOptimizationConfig.O4(),
26
+ }
27
+
28
+
29
+ def get_batch_size(device_name: str, model_name: str, opt_level: str):
30
+ """
31
+ TODO: run actual tests
32
+
33
+ T4 has 16GB
34
+ A10 has 24GB
35
+
36
+ Args:
37
+ device_name (`str`):
38
+ The name of the GPU device in use.
39
+ model_name (`str`):
40
+ The name of the model in use.
41
+ opt_level (`str`):
42
+ The optimization level in use.
43
+
44
+ Returns:
45
+ `int`:
46
+ The batch size to use.
47
+ """
48
+
49
+ if "small" in model_name:
50
+ bs = 192
51
+ elif "base" in model_name:
52
+ bs = 128
53
+ elif "large" in model_name:
54
+ bs = 64
55
+ else:
56
+ bs = 32
57
+
58
+ if "A10" in device_name:
59
+ bs *= 2
60
+
61
+ if opt_level == "O4":
62
+ bs *= 2
63
+
64
+ return bs
65
+
66
+
67
+ def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor):
68
+ """
69
+ Mean pool the token embeddings.
70
+
71
+ Args:
72
+ last_hidden_state (`tuple`):
73
+ The output of the model.
74
+ attention_mask (`torch.Tensor`):
75
+ The attention mask.
76
+
77
+ Returns:
78
+ `torch.Tensor`:
79
+ The mean pooled embeddings.
80
+ """
81
+ input_mask_expanded = (
82
+ attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
83
+ )
84
+ return torch.sum(last_hidden_state * input_mask_expanded, 1) / torch.clamp(
85
+ input_mask_expanded.sum(1), min=1e-9
86
+ )
87
+
88
+
89
+ def get_model_and_tokenizer(model_name: str, optimization_level: str, progress):
90
+ """
91
+ Load the model and tokenizer from the HuggingFace Hub.
92
+
93
+ If the model is not already optimized, optimize it and save it to the local directory.
94
+
95
+ Args:
96
+ model_name (`str`):
97
+ The name of the model to load.
98
+ optimization_level (`str`):
99
+ The optimization level to use. Should be one of `"O2"`, `"O3"`, or `"O4"`.
100
+
101
+ Returns:
102
+ model (`ORTModelForFeatureExtraction`):
103
+ The optimized model.
104
+ tokenizer (`PreTrainedTokenizer`):
105
+ The tokenizer.
106
+ """
107
+ optimized_model_name = f"model_optimized_{optimization_level}.onnx"
108
+
109
+ model_dir = Path(model_name.replace("/", "_"))
110
+ if not (model_dir / optimized_model_name).exists():
111
+ if progress is not None:
112
+ progress(0.2, "Downloading tokenizer...")
113
+
114
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
115
+ tokenizer.save_pretrained(model_dir)
116
+
117
+ if progress is not None:
118
+ progress(0.4, "Downloading model...")
119
+
120
+ model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
121
+ model.save_pretrained(model_dir)
122
+
123
+ optimizer = ORTOptimizer.from_pretrained(model)
124
+ optimization_config = opt_configs[optimization_level]
125
+
126
+ if progress is not None:
127
+ progress(0.6, "Optimizing model...")
128
+
129
+ optimizer.optimize(save_dir=model_dir, optimization_config=optimization_config)
130
+ Path(model_dir / "model_optimized.onnx").rename(
131
+ model_dir / optimized_model_name
132
+ )
133
+
134
+ else:
135
+ tokenizer = AutoTokenizer.from_pretrained(model_dir)
136
+
137
+ if progress is not None:
138
+ progress(0.8, "Loading optimized model and tokenizer...")
139
+
140
+ return (
141
+ ORTModelForFeatureExtraction.from_pretrained(
142
+ model_dir,
143
+ file_name=optimized_model_name,
144
+ provider="CUDAExecutionProvider",
145
+ ),
146
+ tokenizer,
147
+ )
148
+
149
+
150
+ # def collate_fn(examples, tokenizer=None, padding=None, column_name="text"):
151
+ # try:
152
+ # keys = examples[0].keys()
153
+ # except KeyError:
154
+ # print(examples)
155
+ # else:
156
+ # batch = {k: [] for k in examples[0].keys()}
157
+
158
+ # tokenized = tokenizer(
159
+ # [x[column_name] for x in examples],
160
+ # truncation=True,
161
+ # padding=padding,
162
+ # max_length=512,
163
+ # return_tensors="pt"
164
+ # )
165
+
166
+ # tokenized[column_name] = [x[column_name] for x in examples]
167
+
168
+ # return tokenized
169
+
170
+
171
+ @torch.inference_mode()
172
+ def batch_embed(
173
+ ds: datasets.IterableDataset,
174
+ model: ORTModelForFeatureExtraction,
175
+ tokenizer: PreTrainedTokenizer,
176
+ model_name: str,
177
+ column_name: str,
178
+ new_dataset_id: str,
179
+ opt_level: str,
180
+ upload_batch_size: int = 10_000,
181
+ map_batch_size: int = 2000,
182
+ num2skip: int = 0,
183
+ num2embed: int = -1,
184
+ progress=None,
185
+ ):
186
+ """
187
+ Run the model on the dataset and upload the embeddings to the hub.
188
+
189
+ Args:
190
+ ds (`datasets.Dataset`):
191
+ dataset to embed. From `load_hf_dataset`
192
+ model (`ORTModelForFeatureExtraction`):
193
+ model to use for embedding. From `get_model_and_tokenizer`
194
+ tokenizer (`AutoTokenizer`):
195
+ tokenizer to use for embedding. From `get_model_and_tokenizer`
196
+ model_name (`str`):
197
+ name of the model to use. Used to determine batch size.
198
+ column_name (`str`):
199
+ column name to use for embedding. Default option in gradio app is `text`
200
+ new_dataset_id (`str`):
201
+ id of the new dataset to create. Should include username or organization.
202
+ e.g. nbroad/new-embeddings
203
+ opt_level (`str`):
204
+ optimization level to use. Should be one of `O2`, `O3`, `O4`
205
+ See here for more details on optimization levels:
206
+ https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration
207
+ upload_batch_size (`int`, *optional*, defaults to `10_000`):
208
+ number of embeddings to upload at once. Defaults to 10,000.
209
+ map_batch_size (`int`, *optional*, defaults to `2000`):
210
+ number of examples to tokenize at once. Defaults to 2000.
211
+ num2skip (`int`, *optional*, defaults to `0`):
212
+ number of examples to skip. Defaults to 0.
213
+ num2embed (`int`, *optional*, defaults to `-1`):
214
+ number of examples to embed. Defaults to -1, which means all examples.
215
+
216
+ Returns:
217
+ current_count (`int`):
218
+ number of examples embedded so far
219
+ time_taken (`float`):
220
+ time taken to embed the examples in seconds
221
+
222
+ """
223
+
224
+ api = HfApi(
225
+ token=os.environ["HF_TOKEN"],
226
+ )
227
+
228
+ username = api.whoami()["name"]
229
+
230
+ if "/" not in new_dataset_id:
231
+ new_dataset_id = username + "/" + new_dataset_id
232
+
233
+ repo = init_git_repo(new_dataset_id)
234
+
235
+ embeds = []
236
+ texts = []
237
+
238
+ # current count keeps track of how many have been embedded in total
239
+ current_count = num2skip
240
+
241
+ # last_count keeps track of how many had been embedded since last push
242
+ last_count = current_count
243
+
244
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
245
+
246
+ inference_bs = get_batch_size(torch.cuda.get_device_name(0), model_name, opt_level)
247
+
248
+ start_time = time.time()
249
+
250
+ collator = DataCollatorWithPadding(
251
+ tokenizer, padding=True, max_length=512, pad_to_multiple_of=16
252
+ )
253
+
254
+ dl = DataLoader(
255
+ ds,
256
+ batch_size=inference_bs,
257
+ shuffle=False,
258
+ num_workers=2,
259
+ pin_memory=True,
260
+ drop_last=False,
261
+ collate_fn=collator,
262
+ )
263
+
264
+ for batch in dl:
265
+ ids = batch["input_ids"].to(device)
266
+ mask = batch["attention_mask"].to(device)
267
+
268
+ t_ids = torch.zeros_like(ids)
269
+
270
+ outputs = model(input_ids=ids, attention_mask=mask, token_type_ids=t_ids)
271
+
272
+ embeds.extend(mean_pooling(outputs[0], mask).cpu().tolist())
273
+ texts.extend(batch[column_name])
274
+
275
+ current_count += ids.shape[0]
276
+
277
+ # Periodically upload to the hub
278
+ if len(embeds) > upload_batch_size:
279
+ push_to_repo(new_dataset_id, last_count, current_count, embeds, texts, api)
280
+ embeds = []
281
+ texts = []
282
+ last_count = current_count
283
+
284
+ # Provide updates
285
+ if progress is not None:
286
+ progress(
287
+ (current_count, None),
288
+ "Embedding docs...",
289
+ total=None,
290
+ unit="Docs Embedded",
291
+ )
292
+
293
+ time_taken = time.time() - start_time
294
+
295
+ # If there are any remaining embeddings, upload them
296
+ if len(embeds) > 0:
297
+ push_to_repo(new_dataset_id, last_count, current_count, embeds, texts, api)
298
+
299
+ return current_count - num2skip, time_taken
300
+
301
+
302
+ def init_git_repo(repo_id: str):
303
+ """
304
+ Initialize a git repo for the new dataset.
305
+
306
+ ***Removes existing local folder if exists***
307
+
308
+ Args:
309
+ repo_id (`str`):
310
+ id of the new dataset to create. Should include username or organization.
311
+ e.g. nbroad/new-embeddings
312
+ """
313
+ local_dir = repo_id.replace("/", "_")
314
+
315
+ create_repo(
316
+ repo_id,
317
+ repo_type="dataset",
318
+ token=os.environ["HF_TOKEN"],
319
+ private=True,
320
+ exist_ok=True,
321
+ )
322
+ try:
323
+ repo = Repository(
324
+ local_dir=local_dir,
325
+ clone_from=repo_id,
326
+ repo_type="dataset",
327
+ token=os.environ["HF_TOKEN"],
328
+ skip_lfs_files=True,
329
+ )
330
+ except EnvironmentError:
331
+ shutil.rmtree(local_dir)
332
+ repo = Repository(
333
+ local_dir=local_dir,
334
+ clone_from=repo_id,
335
+ repo_type="dataset",
336
+ token=os.environ["HF_TOKEN"],
337
+ skip_lfs_files=True,
338
+ )
339
+
340
+ if repo is not None:
341
+ repo.git_pull()
342
+
343
+ return repo
344
+
345
+
346
+ def push_to_repo(
347
+ repo_id: str,
348
+ last_count: int,
349
+ current_count: int,
350
+ embeds: List[List[float]],
351
+ texts: List[str],
352
+ api: HfApi,
353
+ ):
354
+ """
355
+ Push embeddings to the repo.
356
+
357
+ Args:
358
+ repo_id (`str`):
359
+ id of the new dataset to create. Should include username or organization.
360
+ last_count (`int`):
361
+ last count of embeddings.
362
+ This is the number of embeddings that have already been pushed.
363
+ current_count (`int`):
364
+ current count of embeddings.
365
+ This is the number of embeddings that have been pushed after this batch.
366
+ embeds (`List[List[float]]`):
367
+ list of embeddings to push to the repo
368
+ texts (`List[str]`):
369
+ list of texts to push to the repo
370
+ api (`huggingface_hub.HfApi`):
371
+ api to use to push to the repo
372
+ """
373
+
374
+ temp_ds = Dataset.from_dict(
375
+ {
376
+ "embedding": embeds,
377
+ "text": texts,
378
+ }
379
+ )
380
+
381
+ local_dir = repo_id.replace("/", "_")
382
+
383
+ data_dir = Path(local_dir) / "data"
384
+ data_dir.mkdir(exist_ok=True, parents=True)
385
+
386
+ # use zfill so sorting puts the files in order
387
+ filename = f"embeddings_{str(last_count).zfill(8)}_{current_count}.parquet"
388
+ filepath = str(data_dir / filename)
389
+
390
+ temp_ds.to_parquet(filepath)
391
+
392
+ files = sorted(list(data_dir.glob("*.parquet")))
393
+
394
+ api.upload_file(
395
+ path_or_fileobj=filepath,
396
+ path_in_repo=f"data/{filename}",
397
+ repo_id=repo_id,
398
+ repo_type="dataset",
399
+ run_as_future=True,
400
+ token=os.environ["HF_TOKEN"],
401
+ commit_message=f"Embedded examples {last_count} thru {current_count}",
402
+ )
403
+
404
+ # Delete old files
405
+
406
+ if len(files) > 4:
407
+ for file in files[:2]:
408
+ file.unlink()