victormiller committed
Commit f593237 · verified · 1 Parent(s): b34cbe1

Update main.py

Files changed (1)
  1. main.py +38 -44
main.py CHANGED
@@ -205,48 +205,7 @@ previous_content = P("""The performance of a large language model (LLM)
 and the process followed to create its 📚
 FineWeb-Edu subset.""")
 
-
-
-
-
-
-
-
-@app.get("/intro")
-def intro():
-    return Div(
-        Section(
-            H2("Introduction"),
-            intro_text,
-            intro_list,
-            intro_1,
-            intro_2,
-            intro_3,
-            intro_4,
-            intro_5,
-            id="section1",
-        ),
-        Section(
-            H3("Global Deduplication"),
-            P("TxT360 curated a wide range of datasets, including a whopping 99 Common Crawl Dumps and a list of high quality datasets: StackExchange, Wikipedia, Arxiv, USPTO, DM Math, HackerNews, Ubuntu IRC, Europarl, FreeLaw, PG19, S2ORC, PhilPapers, PubMed Abstracts, and PubMed Central. For the first time in a released dataset, we locally and globally deduplicated the data across each dataset creating the highest quality data available."),
-            id="section2",
-        ),
-        Section(
-            H3("Controllable Upweighting for Flexible Data Sample Weight Control"),
-            P("In large-scale corpora like CommonCrawl, text duplication is a frequent occurrence. Duplication can be considered as a natural upsampling of some data points. Recent studies have highlighted the potential drawbacks of oversampling specific data points, which can negatively impact pretraining performance [2205.10487]. However, when samples are repeated appropriately, the performance can actually improve [2306.01116, 2305.16264, 2406.11794, FineWeb]. Despite this, there is currently no widely accepted best practice for data sampling, and it’s unlikely that a one-size-fits-all approach will emerge given the scale of these datasets. Previous work either leaves the deduplication process to the user (as seen in RedPajama V2 and DCLM-Pool) or provides a corpus that has been downsampled in a specific manner (such as in FineWeb and RefinedWeb)."),
-            P("Given the high cost of deduplication, TxT360 offers a complete deduplication across all datasets (so you don’t have to). Additionally, TxT360 maintains detailed metadata for each sample, including the frequency and location of duplicates. This metadata gives pretrainers the flexibility to adjust the weight of samples as needed. In principle, one can recover the original dataset distribution (footnote: this approach also means a smaller size on disk). We will demonstrate a simple upsampling strategy that results in an effective pretraining dataset. "),
-            id="section3",
-        ),
-        Section(
-            H3("Full and Openly Documented Production Ready Pretraining Corpus"),
-            P("We cover every aspect of the decisions made to produce the dataset, including document selection, filtering, quality assurance, deduplication, standardization and PII. Our reasoning is thoroughly explained, ensuring transparency and replicability. "),
-            P("Our code is open sourced here[link to github]."),
-            P("The dataset is ready for immediate download directly from Hugging Face [link]."),
-            P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
-            id="section4",
-        ),
-
-dataset_comparison = pd.DataFrame(
+dataset_comparison = pd.DataFrame(
     {
         "Dataset": [
             "TxT360",
@@ -383,10 +342,45 @@ def intro():
 
 table_html = preprocessing_steps.to_html(index=False, border=0)
 table_div = Div(NotStr(table_html), style="margin: 40px;")
-
 
 
-
+
+
+
+
+@app.get("/intro")
+def intro():
+    return Div(
+        Section(
+            H2("Introduction"),
+            intro_text,
+            intro_list,
+            intro_1,
+            intro_2,
+            intro_3,
+            intro_4,
+            intro_5,
+            id="section1",
+        ),
+        Section(
+            H3("Global Deduplication"),
+            P("TxT360 curated a wide range of datasets, including a whopping 99 Common Crawl Dumps and a list of high quality datasets: StackExchange, Wikipedia, Arxiv, USPTO, DM Math, HackerNews, Ubuntu IRC, Europarl, FreeLaw, PG19, S2ORC, PhilPapers, PubMed Abstracts, and PubMed Central. For the first time in a released dataset, we locally and globally deduplicated the data across each dataset creating the highest quality data available."),
+            id="section2",
+        ),
+        Section(
+            H3("Controllable Upweighting for Flexible Data Sample Weight Control"),
+            P("In large-scale corpora like CommonCrawl, text duplication is a frequent occurrence. Duplication can be considered as a natural upsampling of some data points. Recent studies have highlighted the potential drawbacks of oversampling specific data points, which can negatively impact pretraining performance [2205.10487]. However, when samples are repeated appropriately, the performance can actually improve [2306.01116, 2305.16264, 2406.11794, FineWeb]. Despite this, there is currently no widely accepted best practice for data sampling, and it’s unlikely that a one-size-fits-all approach will emerge given the scale of these datasets. Previous work either leaves the deduplication process to the user (as seen in RedPajama V2 and DCLM-Pool) or provides a corpus that has been downsampled in a specific manner (such as in FineWeb and RefinedWeb)."),
+            P("Given the high cost of deduplication, TxT360 offers a complete deduplication across all datasets (so you don’t have to). Additionally, TxT360 maintains detailed metadata for each sample, including the frequency and location of duplicates. This metadata gives pretrainers the flexibility to adjust the weight of samples as needed. In principle, one can recover the original dataset distribution (footnote: this approach also means a smaller size on disk). We will demonstrate a simple upsampling strategy that results in an effective pretraining dataset. "),
+            id="section3",
+        ),
+        Section(
+            H3("Full and Openly Documented Production Ready Pretraining Corpus"),
+            P("We cover every aspect of the decisions made to produce the dataset, including document selection, filtering, quality assurance, deduplication, standardization and PII. Our reasoning is thoroughly explained, ensuring transparency and replicability. "),
+            P("Our code is open sourced here[link to github]."),
+            P("The dataset is ready for immediate download directly from Hugging Face [link]."),
+            P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
+            id="section4",
+        ),
+
         Section(
             H2("Combining the Best of Web and Curated Sources"),
             H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),