victormiller committed
Commit 2477fa9 · verified · 1 Parent(s): 27361f1

Update main.py

Files changed (1)
  1. main.py +32 -30
main.py CHANGED
@@ -128,25 +128,7 @@ intro_3 = P("3. Provides only unique data elements via globally deduplicated acr
 intro_4 = P("4. Retains all deduplication metadata for custom upweighting")
 intro_5 = P("5. Is Production ready! Download here [link to HF repo]")
 
-
-
-@app.get("/intro")
-def intro():
-    return Div(
-        Section(
-            H2("Introduction"),
-            intro_text,
-            intro_list,
-            intro_1,
-            intro_2,
-            intro_3,
-            intro_4,
-            intro_5,
-            id="section1",
-        ),
-        Section(
-            H2("Background"),
-            P(
+previous_background = P(
 """ The quality and size of a pre-training dataset
 play a crucial role in the performance of large
 language models (LLMs). The community has
@@ -176,12 +158,8 @@ def intro():
 sources, TxT360 is crafted to meet and surpass the
 rigorous standards required for state-of-the-art
 LLM pre-training. """
-            ),
-            id="section2",
-        ),
-        Section(
-            H2("Main Content"),
-            P("""The performance of a large language model (LLM)
+)
+previous_content = P("""The performance of a large language model (LLM)
 depends heavily on the quality and size of its
 pretraining dataset. However, the pretraining
 datasets for state-of-the-art open LLMs like Llama
@@ -225,14 +203,38 @@ def intro():
 data quality at scale, the 🍷 FineWeb recipe
 (listing and explaining all of our design choices),
 and the process followed to create its 📚
-FineWeb-Edu subset."""),
+FineWeb-Edu subset.""")
+@app.get("/intro")
+def intro():
+    return Div(
+        Section(
+            H2("Introduction"),
+            intro_text,
+            intro_list,
+            intro_1,
+            intro_2,
+            intro_3,
+            intro_4,
+            intro_5,
+            id="section1",
+        ),
+        Section(
+            H3("Global Deduplication"),
+            P("TxT360 curated a wide range of datasets, including a whopping 99 Common Crawl Dumps and a list of high quality datasets: StackExchange, Wikipedia, Arxiv, USPTO, DM Math, HackerNews, Ubuntu IRC, Europarl, FreeLaw, PG19, S2ORC, PhilPapers, PubMed Abstracts, and PubMed Central. For the first time in a released dataset, we locally and globally deduplicated the data across each dataset creating the highest quality data available.")
+            id="section2",
+        ),
+        Section(
+            H2("Main Content"),
+            P("In large-scale corpora like CommonCrawl, text duplication is a frequent occurrence. Duplication can be considered as a natural upsampling of some data points. Recent studies have highlighted the potential drawbacks of oversampling specific data points, which can negatively impact pretraining performance [2205.10487]. However, when samples are repeated appropriately, the performance can actually improve [2306.01116, 2305.16264, 2406.11794, FineWeb]. Despite this, there is currently no widely accepted best practice for data sampling, and it’s unlikely that a one-size-fits-all approach will emerge given the scale of these datasets. Previous work either leaves the deduplication process to the user (as seen in RedPajama V2 and DCLM-Pool) or provides a corpus that has been downsampled in a specific manner (such as in FineWeb and RefinedWeb).")
+            P("Given the high cost of deduplication, TxT360 offers a complete deduplication across all datasets (so you don’t have to). Additionally, TxT360 maintains detailed metadata for each sample, including the frequency and location of duplicates. This metadata gives pretrainers the flexibility to adjust the weight of samples as needed. In principle, one can recover the original dataset distribution (footnote: this approach also means a smaller size on disk). We will demonstrate a simple upsampling strategy that results in an effective pretraining dataset. ")
             id="section3",
         ),
         Section(
-            H2("Conclusion"),
-            P("""This is the conclusion section where we
-summarize the key points discussed in the blog post
-and provide final thoughts."""),
+            H2("Full and Openly Documented Production Ready Pretraining Corpus"),
+            P("We cover every aspect of the decisions made to produce the dataset, including document selection, filtering, quality assurance, deduplication, standardization and PII. Our reasoning is thoroughly explained, ensuring transparency and replicability. "),
+            P("Our code is open sourced here[link to github]."),
+            P("The dataset is ready for immediate download directly from Hugging Face [link]."),
+            P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
             id="section4",
         ),
         id="inner-text",