Spaces:
Sleeping
Sleeping
victormiller
committed on
Update main.py
Browse files
main.py
CHANGED
@@ -117,23 +117,26 @@ def main():
|
|
117 |
),
|
118 |
)
|
119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
@app.get("/intro")
|
122 |
def intro():
|
123 |
return Div(
|
124 |
Section(
|
125 |
H2("Introduction"),
|
126 |
-
|
127 |
-
|
128 |
-
We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
|
129 |
-
|
130 |
-
|
131 |
-
- 1. Curates commonly used pretraining datasets, including all CommonCrawl
|
132 |
-
- 2. Employs carefully selected filters designed for each data source
|
133 |
-
- 3. Provides only unique data elements via globally deduplicated across all datasets
|
134 |
-
- 4. Retains all deduplication metadata for custom upweighting
|
135 |
-
- 5. Is Production ready! Download here [link to HF repo]
|
136 |
-
"""),
|
137 |
id="section1",
|
138 |
),
|
139 |
Section(
|
|
|
117 |
),
|
118 |
)
|
119 |
|
120 |
+
intro_text = P(
|
121 |
+
"""Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
|
122 |
+
|
123 |
+
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
|
124 |
+
|
125 |
+
1. Curates commonly used pretraining datasets, including all CommonCrawl
|
126 |
+
2. Employs carefully selected filters designed for each data source
|
127 |
+
3. Provides only unique data elements via globally deduplicated across all datasets
|
128 |
+
4. Retains all deduplication metadata for custom upweighting
|
129 |
+
5. Is Production ready! Download here [link to HF repo]
|
130 |
+
""")
|
131 |
+
|
132 |
|
133 |
@app.get("/intro")
|
134 |
def intro():
|
135 |
return Div(
|
136 |
Section(
|
137 |
H2("Introduction"),
|
138 |
+
intro_text,
|
139 |
+
intro_list,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
id="section1",
|
141 |
),
|
142 |
Section(
|