"""FastHTML app serving the TxT360 blog post.

Defines the landing page (`/`), the introduction fragment (`/intro`), and
registers the remaining page fragments from the `curated`, `web`, `common`
and `results` modules.
"""

from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import curated
import web
import common
import results

app, rt = fast_app(
    debug=True,
    pico=False,
    hdrs=(
        Meta(charset="UTF-8"),
        Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
        Script(src="https://distill.pub/template.v2.js"),
        Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
        Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
        Link(rel="stylesheet", href="style.css"),
        MarkdownJS(),
        HighlightJS(langs=["python", "javascript", "html", "css"]),
    ),
)


def _toc_section_link(label, anchor):
    """Return a table-of-contents list item pointing at `anchor` on the /intro page."""
    target = f"/intro#{anchor}"
    return Li(A(label, href=target, hx_get=target, hx_target="#inner-text"))


def _toc_page_link(label, route):
    """Return a table-of-contents entry whose htmx request for `route` targets #inner-text."""
    return Div(A(label, href="#inner-text"), hx_get=route, hx_target="#inner-text")


@app.get("/")
def main():
    """Render the landing page: front matter, title banner, and the article.

    The article holds the table-of-contents navigation followed by the
    introduction content returned by `intro()`, whose root element has
    id "inner-text" (the target of every navigation entry).
    """
    return Div(
        D_front_matter(),
        D_title(
            H1(
                "TxT360: the most comprehensive, highest quality, and production ready pretraining dataset",
                cls="l-body",
                style="text-align: center;",
            ),
            Div(
                Img(src="images/llm360_logo.png"),
                id="title-plot",
                cls="main-plot-container l-page",
            ),
        ),
        D_article(
            D_contents(
                Nav(
                    H3("Table of Contents"),
                    Div(
                        A("TxT360", href="#_self"),
                        hx_get="/intro",
                        hx_target="#inner-text",
                    ),
                    Div(
                        Ul(
                            _toc_section_link("Introduction", "section1"),
                            _toc_section_link("Background", "section2"),
                            _toc_section_link("Main Content", "section3"),
                            _toc_section_link("Conclusion", "section4"),
                        ),
                    ),
                    _toc_page_link("Web Data", "/webdata"),
                    _toc_page_link("Curated Sources", "/curated"),
                    _toc_page_link("Common Steps", "/common"),
                    _toc_page_link("TxT360 Results", "/results"),
                    role="navigation",
                    cls="l-text figcaption",
                ),
            ),
            intro(),
        ),
    )


# --- Static content for the introduction --------------------------------------

intro_text = P(
    """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability."""
)

intro_list = P(
    """We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:"""
)

intro_1 = P("1. Curates commonly used pretraining datasets, including all CommonCrawl")
intro_2 = P("2. Employs carefully selected filters designed for each data source")
intro_3 = P("3. Provides only unique data elements via globally deduplicated across all datasets")
intro_4 = P("4. Retains all deduplication metadata for custom upweighting")
intro_5 = P("5. Is Production ready! Download here [link to HF repo]")

# NOTE(review): `previous_background` and `previous_content` are not referenced
# anywhere in this file as shown; kept in case another module imports them.
previous_background = P(
    """ The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). The community has introduced a variety of datasets for this purpose, including purely web-based datasets like RefinedWeb [1], RedPajama-Data-V2 [2], DCLM [3], and FineWeb [4], as well as comprehensive datasets derived from multiple highly-curated data sources such as The Pile [5], RedPajama-Data-V1 [6], and Dolma [7] . It is commonly known that web-based datasets provide a vast quantity of data, while highly-curated multi-source datasets consistently deliver high quality and diversity, both critical for effective LLM pre-training. However, despite the advancements in both types of data, each type of dataset has its limitations. For instance, the processing scripts for the web dataset, RefinedWeb, known for its high quality, are not public, and only about 10% of the entire dataset has been disclosed. Conversely, the web component of existing highly-curated multi-source datasets is relatively small compared to purely web-based datasets, limiting their coverage and diversity compared to the scale of information from the internet. By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training. """
)

previous_content = P(
    """The performance of a large language model (LLM) depends heavily on the quality and size of its pretraining dataset. However, the pretraining datasets for state-of-the-art open LLMs like Llama 3 and Mixtral are not publicly available and very little is known about how they were created. Reading time: 45 min. For the best reading experience, we recommend not using a mobile phone. Recently, we released 🍷 FineWeb, a new, large-scale (15-trillion tokens, 44TB disk space) dataset for LLM pretraining. FineWeb is derived from 96 CommonCrawl snapshots and produces better-performing LLMs than other open pretraining datasets. To bring more clarity in machine learning and advance the open understanding of how to train good quality large language models, we carefully documented and ablated all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies. The present long form report is a deep dive in how to create a large and high-quality web-scale dataset for LLM pretraining. The dataset itself, 🍷 FineWeb, is available here. We are extremely thankful to the whole distill.pub team (Christopher Olah, Shan Carter, Ludwig Schubert in particular) for creating the template on which we based this blog post. Thanks also for inspiring us with exquisitely crafted articles and blog posts. In this report we also introduce 📚 FineWeb-Edu, a subset of FineWeb constructed using scalable automated high-quality annotations for educational value, and which outperforms all openly accessible web-datasets on a number of educational benchmarks such as MMLU, ARC, and OpenBookQA. 📚 FineWeb-Edu is available in two sizes/filtering-level: 1.3 trillion (very high educational content) and 5.4 trillion (high educational content) tokens (all tokens are measured with GPT2 tokenizer). You can download it here. Both datasets are released under the permissive ODC-By 1.0 license TLDR: This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset."""
)

# --- Dataset comparison table (Table 1) ---------------------------------------

# NOTE(review): the cell values below (and those of the "Wikipedia" column) look
# like placeholder text rather than real per-source coverage data — confirm and
# replace. The nine columns listed in `_SHARED_VALUE_COLUMNS` carried identical
# values in the original, so they share this single list.
_SHARED_COLUMN_VALUES = [
    "May exclude documents in less common languages",
    "May remove documents with valuable information",
    "May introduce bias in the analysis",
    "May not accurately represent the language distribution",
    "May not capture the complexity of document structure",
    "May be sensitive to noise and outliers",
    "May not capture the semantic meaning of words",
    "RedPajama-v1",
]

_SHARED_VALUE_COLUMNS = (
    "FreeLaw",
    "DM Math",
    "USPTO",
    "PG-19",
    "HackerNews",
    "Ubuntu IRC",
    "EuroParl",
    "StackExchange",
    "Code",
)

dataset_comparison = pd.DataFrame(
    {
        "Dataset": [
            "TxT360",
            "FineWeb",
            "RefinedWeb",
            "RedPajama-v2",
            "C4",
            "Dolma",
            "RedPajama-v1",
            "The Pile",
        ],
        "CommonCrawl": [
            "99 Snapshots",
            "96 Snapshots",
            "90 Snapshots",
            "84 Snapshots",
            "1 Snapshots",
            "24 Snapshots",
            "5 Snapshots",
            "0.6% of 74 Snapshots",
        ],
        "Papers": [
            "5 Sources",
            "-",
            "-",
            "-",
            "-",
            "1 Source",
            "1 Source",
            "4 Sources",
        ],
        "Wikipedia": [
            "Improves data quality by removing irrelevant documents",
            "Filters out low-quality or incomplete documents",
            "Provides additional information for analysis",
            "Enables language-specific analysis and insights",
            "Helps understand the complexity and content of documents",
            "Identifies important terms and topics in the dataset",
            "Quantifies the importance of individual words",
            "RedPajama-v1",
        ],
        # Dict insertion order is preserved, so these columns follow
        # "Wikipedia" in the same order as before.
        **{column: _SHARED_COLUMN_VALUES for column in _SHARED_VALUE_COLUMNS},
    }
)

# Render the frame once at import time; the original referenced an undefined
# `preprocessing_steps` here, while the frame built above is `dataset_comparison`.
table_html = dataset_comparison.to_html(index=False, border=0)
# NotStr embeds the generated table markup as raw HTML instead of escaped text.
table_div = Div(NotStr(table_html), style="margin: 40px;")


@app.get("/intro")
def intro():
    """Render the introduction fragment.

    Returns a Div with id "inner-text" containing five sections
    (ids "section1" through "section5"); the last one embeds the dataset
    comparison table.
    """
    return Div(
        Section(
            H2("Introduction"),
            intro_text,
            intro_list,
            intro_1,
            intro_2,
            intro_3,
            intro_4,
            intro_5,
            id="section1",
        ),
        Section(
            H3("Global Deduplication"),
            P("TxT360 curated a wide range of datasets, including a whopping 99 Common Crawl Dumps and a list of high quality datasets: StackExchange, Wikipedia, Arxiv, USPTO, DM Math, HackerNews, Ubuntu IRC, Europarl, FreeLaw, PG19, S2ORC, PhilPapers, PubMed Abstracts, and PubMed Central. For the first time in a released dataset, we locally and globally deduplicated the data across each dataset creating the highest quality data available."),
            id="section2",
        ),
        Section(
            H3("Controllable Upweighting for Flexible Data Sample Weight Control"),
            P("In large-scale corpora like CommonCrawl, text duplication is a frequent occurrence. Duplication can be considered as a natural upsampling of some data points. Recent studies have highlighted the potential drawbacks of oversampling specific data points, which can negatively impact pretraining performance [2205.10487]. However, when samples are repeated appropriately, the performance can actually improve [2306.01116, 2305.16264, 2406.11794, FineWeb]. Despite this, there is currently no widely accepted best practice for data sampling, and it’s unlikely that a one-size-fits-all approach will emerge given the scale of these datasets. Previous work either leaves the deduplication process to the user (as seen in RedPajama V2 and DCLM-Pool) or provides a corpus that has been downsampled in a specific manner (such as in FineWeb and RefinedWeb)."),
            P("Given the high cost of deduplication, TxT360 offers a complete deduplication across all datasets (so you don’t have to). Additionally, TxT360 maintains detailed metadata for each sample, including the frequency and location of duplicates. This metadata gives pretrainers the flexibility to adjust the weight of samples as needed. In principle, one can recover the original dataset distribution (footnote: this approach also means a smaller size on disk). We will demonstrate a simple upsampling strategy that results in an effective pretraining dataset. "),
            id="section3",
        ),
        Section(
            H3("Full and Openly Documented Production Ready Pretraining Corpus"),
            P("We cover every aspect of the decisions made to produce the dataset, including document selection, filtering, quality assurance, deduplication, standardization and PII. Our reasoning is thoroughly explained, ensuring transparency and replicability. "),
            P("Our code is open sourced here[link to github]."),
            P("The dataset is ready for immediate download directly from Hugging Face [link]."),
            P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
            id="section4",
        ),
        Section(
            H2("Combining the Best of Web and Curated Sources"),
            H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
            P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
            table_div,
            id="section5",
        ),
        id="inner-text",
    )


# Register the page fragments implemented in sibling modules.
rt("/curated")(curated.curated)
rt("/webdata")(web.web_data)
rt("/common")(common.common_steps)
rt("/results")(results.results)

serve()