victormiller committed on
Commit
b34cbe1
·
verified ·
1 Parent(s): de513a2

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +157 -0
main.py CHANGED
@@ -204,6 +204,14 @@ previous_content = P("""The performance of a large language model (LLM)
204
  (listing and explaining all of our design choices),
205
  and the process followed to create its 📚
206
  FineWeb-Edu subset.""")
 
 
 
 
 
 
 
 
207
  @app.get("/intro")
208
  def intro():
209
  return Div(
@@ -237,6 +245,155 @@ def intro():
237
  P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
238
  id="section4",
239
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  id="inner-text",
241
  )
242
 
 
204
  (listing and explaining all of our design choices),
205
  and the process followed to create its 📚
206
  FineWeb-Edu subset.""")
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
  @app.get("/intro")
216
  def intro():
217
  return Div(
 
245
  P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
246
  id="section4",
247
  ),
248
+
249
+ dataset_comparison = pd.DataFrame(
250
+ {
251
+ "Dataset": [
252
+ "TxT360",
253
+ "FineWeb",
254
+ "RefinedWeb",
255
+ "RedPajama-v2",
256
+ "C4",
257
+ "Dolma",
258
+ "RedPajama-v1",
259
+ "The Pile",
260
+ ],
261
+ "CommonCrawl": [
262
+ "99 Snapshots",
263
+ "96 Snapshots",
264
+ "90 Snapshots",
265
+ "84 Snapshots",
266
+ "1 Snapshots",
267
+ "24 Snapshots",
268
+ "5 Snapshots",
269
+ "0.6% of 74 Snapshots",
270
+ ],
271
+ "Papers": [
272
+ "5 Sources",
273
+ "-",
274
+ "-",
275
+ "-",
276
+ "-",
277
+ "1 Source",
278
+ "1 Source",
279
+ "4 Sources",
280
+ ],
281
+ "Wikipedia": [
282
+ "Improves data quality by removing irrelevant documents",
283
+ "Filters out low-quality or incomplete documents",
284
+ "Provides additional information for analysis",
285
+ "Enables language-specific analysis and insights",
286
+ "Helps understand the complexity and content of documents",
287
+ "Identifies important terms and topics in the dataset",
288
+ "Quantifies the importance of individual words",
289
+ "RedPajama-v1",
290
+ ],
291
+ "FreeLaw": [
292
+ "May exclude documents in less common languages",
293
+ "May remove documents with valuable information",
294
+ "May introduce bias in the analysis",
295
+ "May not accurately represent the language distribution",
296
+ "May not capture the complexity of document structure",
297
+ "May be sensitive to noise and outliers",
298
+ "May not capture the semantic meaning of words",
299
+ "RedPajama-v1",
300
+ ],
301
+ "DM Math": [
302
+ "May exclude documents in less common languages",
303
+ "May remove documents with valuable information",
304
+ "May introduce bias in the analysis",
305
+ "May not accurately represent the language distribution",
306
+ "May not capture the complexity of document structure",
307
+ "May be sensitive to noise and outliers",
308
+ "May not capture the semantic meaning of words",
309
+ "RedPajama-v1",
310
+ ],
311
+ "USPTO": [
312
+ "May exclude documents in less common languages",
313
+ "May remove documents with valuable information",
314
+ "May introduce bias in the analysis",
315
+ "May not accurately represent the language distribution",
316
+ "May not capture the complexity of document structure",
317
+ "May be sensitive to noise and outliers",
318
+ "May not capture the semantic meaning of words",
319
+ "RedPajama-v1",
320
+ ],
321
+ "PG-19": [
322
+ "May exclude documents in less common languages",
323
+ "May remove documents with valuable information",
324
+ "May introduce bias in the analysis",
325
+ "May not accurately represent the language distribution",
326
+ "May not capture the complexity of document structure",
327
+ "May be sensitive to noise and outliers",
328
+ "May not capture the semantic meaning of words",
329
+ "RedPajama-v1",
330
+ ],
331
+ "HackerNews": [
332
+ "May exclude documents in less common languages",
333
+ "May remove documents with valuable information",
334
+ "May introduce bias in the analysis",
335
+ "May not accurately represent the language distribution",
336
+ "May not capture the complexity of document structure",
337
+ "May be sensitive to noise and outliers",
338
+ "May not capture the semantic meaning of words",
339
+ "RedPajama-v1",
340
+ ],
341
+ "Ubuntu IRC": [
342
+ "May exclude documents in less common languages",
343
+ "May remove documents with valuable information",
344
+ "May introduce bias in the analysis",
345
+ "May not accurately represent the language distribution",
346
+ "May not capture the complexity of document structure",
347
+ "May be sensitive to noise and outliers",
348
+ "May not capture the semantic meaning of words",
349
+ "RedPajama-v1",
350
+ ],
351
+ "EuroParl": [
352
+ "May exclude documents in less common languages",
353
+ "May remove documents with valuable information",
354
+ "May introduce bias in the analysis",
355
+ "May not accurately represent the language distribution",
356
+ "May not capture the complexity of document structure",
357
+ "May be sensitive to noise and outliers",
358
+ "May not capture the semantic meaning of words",
359
+ "RedPajama-v1",
360
+ ],
361
+ "StackExchange": [
362
+ "May exclude documents in less common languages",
363
+ "May remove documents with valuable information",
364
+ "May introduce bias in the analysis",
365
+ "May not accurately represent the language distribution",
366
+ "May not capture the complexity of document structure",
367
+ "May be sensitive to noise and outliers",
368
+ "May not capture the semantic meaning of words",
369
+ "RedPajama-v1",
370
+ ],
371
+ "Code": [
372
+ "May exclude documents in less common languages",
373
+ "May remove documents with valuable information",
374
+ "May introduce bias in the analysis",
375
+ "May not accurately represent the language distribution",
376
+ "May not capture the complexity of document structure",
377
+ "May be sensitive to noise and outliers",
378
+ "May not capture the semantic meaning of words",
379
+ "RedPajama-v1",
380
+ ],
381
+ }
382
+ )
383
+
384
+ table_html = preprocessing_steps.to_html(index=False, border=0)
385
+ table_div = Div(NotStr(table_html), style="margin: 40px;")
386
+
387
+
388
+
389
+
390
+ Section(
391
+ H2("Combining the Best of Web and Curated Sources"),
392
+ H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
393
+ P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
394
+ table_div,
395
+ id="section5",
396
+ ),
397
  id="inner-text",
398
  )
399