fh-new-vm1

Sleeping

victormiller committed on Sep 24, 2024

Commit

d84fec1

verified ·

1 Parent(s): 8115558

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -117,23 +117,26 @@ def main():
         ),
     )
 @app.get("/intro")
 def intro():
     return Div(
         Section(
             H2("Introduction"),
-            P("""Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.
-We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
-- 1. Curates commonly used pretraining datasets, including all CommonCrawl
-- 2. Employs carefully selected filters designed for each data source
-- 3. Provides only unique data elements via globally deduplicated across all datasets
-- 4. Retains all deduplication metadata for custom upweighting
-- 5. Is Production ready! Download here [link to HF repo]
-"""),
             id="section1",
         ),
         Section(

         ),
     )
+intro_text = P(
+"""Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
+intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
+1. Curates commonly used pretraining datasets, including all CommonCrawl
+2. Employs carefully selected filters designed for each data source
+3. Provides only unique data elements via globally deduplicated across all datasets
+4. Retains all deduplication metadata for custom upweighting
+5. Is Production ready! Download here [link to HF repo]
+""")
 @app.get("/intro")
 def intro():
     return Div(
         Section(
             H2("Introduction"),
+            intro_text,
+            intro_list,
             id="section1",
         ),
         Section(