victormiller committed on
Commit
d84fec1
·
verified ·
1 Parent(s): 8115558

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +14 -11
main.py CHANGED
@@ -117,23 +117,26 @@ def main():
117
  ),
118
  )
119
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
  @app.get("/intro")
122
  def intro():
123
  return Div(
124
  Section(
125
  H2("Introduction"),
126
- P("""Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.
127
-
128
- We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
129
-
130
-
131
- - 1. Curates commonly used pretraining datasets, including all CommonCrawl
132
- - 2. Employs carefully selected filters designed for each data source
133
- - 3. Provides only unique data elements via globally deduplicated across all datasets
134
- - 4. Retains all deduplication metadata for custom upweighting
135
- - 5. Is Production ready! Download here [link to HF repo]
136
- """),
137
  id="section1",
138
  ),
139
  Section(
 
117
  ),
118
  )
119
 
120
+ intro_text = P(
121
+ """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
122
+
123
+ intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
124
+
125
+ 1. Curates commonly used pretraining datasets, including all CommonCrawl
126
+ 2. Employs carefully selected filters designed for each data source
127
+ 3. Provides only unique data elements via globally deduplicated across all datasets
128
+ 4. Retains all deduplication metadata for custom upweighting
129
+ 5. Is Production ready! Download here [link to HF repo]
130
+ """)
131
+
132
 
133
  @app.get("/intro")
134
  def intro():
135
  return Div(
136
  Section(
137
  H2("Introduction"),
138
+ intro_text,
139
+ intro_list,
 
 
 
 
 
 
 
 
 
140
  id="section1",
141
  ),
142
  Section(