fh-new-vm1

Sleeping

App Files Files Community

victormiller committed on Sep 24, 2024

Commit

b34cbe1

verified ·

1 Parent(s): de513a2

Update main.py

Browse files

Files changed (1) hide show

main.py +157 -0

main.py CHANGED Viewed

@@ -204,6 +204,14 @@ previous_content =  P("""The performance of a large language model (LLM)
                     (listing and explaining all of our design choices),
                     and the process followed to create its 📚
                     FineWeb-Edu subset.""")
 @app.get("/intro")
 def intro():
     return Div(
@@ -237,6 +245,155 @@ def intro():
             P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
             id="section4",
         ),
         id="inner-text",
     )

                     (listing and explaining all of our design choices),
                     and the process followed to create its 📚
                     FineWeb-Edu subset.""")
 @app.get("/intro")
 def intro():
     return Div(
             P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
             id="section4",
         ),
+    dataset_comparison = pd.DataFrame(  # Table 1 data: one row per dataset, one column per data source
+        {
+            "Dataset": [
+                "TxT360",
+                "FineWeb",
+                "RefinedWeb",
+                "RedPajama-v2",
+                "C4",
+                "Dolma",
+                "RedPajama-v1",
+                "The Pile",
+            ],
+            "CommonCrawl": [
+                "99 Snapshots",
+                "96 Snapshots",
+                "90 Snapshots",
+                "84 Snapshots",
+                "1 Snapshots",
+                "24 Snapshots",
+                "5 Snapshots",
+                "0.6% of 74 Snapshots",
+            ],
+            "Papers": [
+                "5 Sources",
+                "-",
+                "-",
+                "-",
+                "-",
+                "1 Source",
+                "1 Source",
+                "4 Sources",
+            ],
+            "Wikipedia": [  # NOTE(review): cells in this and all following columns read like placeholder text, not source coverage - confirm real values
+                "Improves data quality by removing irrelevant documents",
+                "Filters out low-quality or incomplete documents",
+                "Provides additional information for analysis",
+                "Enables language-specific analysis and insights",
+                "Helps understand the complexity and content of documents",
+                "Identifies important terms and topics in the dataset",
+                "Quantifies the importance of individual words",
+                "RedPajama-v1",
+            ],
+            "FreeLaw": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "DM Math": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "USPTO": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "PG-19": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "HackerNews": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "Ubuntu IRC": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "EuroParl": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "StackExchange": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+            "Code": [
+                "May exclude documents in less common languages",
+                "May remove documents with valuable information",
+                "May introduce bias in the analysis",
+                "May not accurately represent the language distribution",
+                "May not capture the complexity of document structure",
+                "May be sensitive to noise and outliers",
+                "May not capture the semantic meaning of words",
+                "RedPajama-v1",
+            ],
+        }
+    )
+    table_html = dataset_comparison.to_html(index=False, border=0)  # render the frame built above (was `preprocessing_steps`, which this hunk never defines)
+    table_div = Div(NotStr(table_html), style="margin: 40px;")  # NOTE(review): these assignments sit inside the Div(...) argument list in this hunk - confirm placement
+        Section(
+            H2("Combining the Best of Web and Curated Sources"),
+            H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
+            P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
+            table_div,
+            id="section5",
+        ),
         id="inner-text",
     )