Spaces:
Running
Running
File size: 11,387 Bytes
739cf2e 08fabf7 739cf2e 08fabf7 739cf2e 08fabf7 739cf2e eb008d8 739cf2e 08fabf7 eb008d8 739cf2e 907b541 739cf2e 907b541 739cf2e 907b541 739cf2e eb008d8 739cf2e 08fabf7 907b541 739cf2e eb008d8 907b541 739cf2e 907b541 739cf2e 907b541 739cf2e 4ee5487 739cf2e 907b541 739cf2e f6712d8 21cb44b f6712d8 08fabf7 f6712d8 08fabf7 21cb44b 08fabf7 f6712d8 08fabf7 f6712d8 08fabf7 f6712d8 08fabf7 f6712d8 b717308 739cf2e 907b541 739cf2e f6712d8 739cf2e f79ba24 b717308 08fabf7 96dc279 08fabf7 96dc279 08fabf7 b717308 739cf2e eb008d8 739cf2e b717308 907b541 b717308 907b541 e7fb31c f79ba24 b717308 907b541 e7fb31c 907b541 739cf2e 907b541 96dc279 907b541 96dc279 907b541 739cf2e 907b541 739cf2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 |
import logging
from pathlib import Path
import gradio as gr
import pandas as pd
from datasets import Dataset
from gradio_log import Log
from huggingface_hub import DatasetCard
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.schema import MetadataMode
from llama_index.readers.docling import DoclingReader
from llama_index.readers.file import (
EpubReader,
HWPReader,
ImageReader,
IPYNBReader,
MboxReader,
PandasCSVReader,
PandasExcelReader,
VideoAudioReader,
)
from tqdm.auto import tqdm
# Path of the log file; it is also handed to the `Log` component in the UI.
log_file = "logs.txt"
Path(log_file).touch(exist_ok=True)

# Attach exactly ONE file handler to the root logger. Calling
# `logging.basicConfig(filename=...)` and then adding another `FileHandler`
# for the same path makes every record appear twice in the file.
# `logging.BASIC_FORMAT` is the format `basicConfig` applies by default.
_file_handler = logging.FileHandler(log_file)
_file_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
_root_logger = logging.getLogger()
_root_logger.setLevel(logging.INFO)
_root_logger.addHandler(_file_handler)
def load_corpus(
    files, chunk_size=256, chunk_overlap=0, verbose=True, split_sentences=True
):
    """Load ``files`` into documents and return them as an ``{id: text}`` dict.

    Each entry of ``files`` is first passed to ``DoclingReader.load_data``.
    If that raises for any entry, the failure is logged and the whole batch
    is re-read with ``SimpleDirectoryReader`` and a per-extension reader table.

    Args:
        files: Iterable of file sources handed to the readers.
        chunk_size: Forwarded to ``split_corpus``.
        chunk_overlap: Forwarded to ``split_corpus``.
        verbose: When true, emit ``gr.Info`` progress notifications.
        split_sentences: When truthy, chunk the documents via
            ``split_corpus``; otherwise return one entry per document.

    Returns:
        ``{doc.id_: doc.text}`` when ``split_sentences`` is falsy, otherwise
        the result of ``split_corpus``.
    """
    if verbose:
        gr.Info("Loading files...")
    docling_reader = DoclingReader()
    try:
        docs = []
        for file in files:
            docs.extend(docling_reader.load_data(file))
    except Exception:
        # Best-effort fallback is deliberate, but record why Docling failed
        # instead of discarding the error silently.
        logging.exception(
            "Docling failed to load the files; falling back to SimpleDirectoryReader"
        )
        # NOTE(review): some values below are reader classes while others are
        # a reader instance -- confirm SimpleDirectoryReader accepts classes.
        reader = SimpleDirectoryReader(
            input_files=files,
            file_extractor={
                ".hwp": HWPReader,
                ".pdf": docling_reader,
                ".docx": docling_reader,
                ".pptx": docling_reader,
                ".ppt": docling_reader,
                ".pptm": docling_reader,
                ".gif": ImageReader,
                ".jpg": ImageReader,
                ".png": ImageReader,
                ".jpeg": ImageReader,
                ".webp": ImageReader,
                ".mp3": VideoAudioReader,
                ".mp4": VideoAudioReader,
                ".csv": PandasCSVReader,
                ".epub": EpubReader,
                ".md": docling_reader,
                ".mbox": MboxReader,
                ".ipynb": IPYNBReader,
                ".xls": PandasExcelReader,
                ".xlsx": PandasExcelReader,
            },
        )
        docs = reader.load_data()
    # A single branch on truthiness: every falsy value (not only the literal
    # ``False``) skips splitting, so the function never falls through and
    # returns ``None``.
    if not split_sentences:
        gr.Info(
            "Skipping sentence splitting. Each file will be a single row in the dataset."
        )
        return {doc.id_: doc.text for doc in docs}
    return split_corpus(verbose, docs, chunk_size, chunk_overlap)
def split_corpus(verbose, docs, chunk_size, chunk_overlap):
    """Chunk ``docs`` with a ``SentenceSplitter`` and return the chunks.

    Args:
        verbose: When true, emit ``gr.Info`` notifications and show the
            splitter's progress.
        docs: Documents to split.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Chunk overlap passed to the splitter.

    Returns:
        Dict mapping each node id to its content (metadata excluded), with
        empty-content entries removed.
    """
    if verbose:
        gr.Info(f"Loaded {len(docs)} docs")
    splitter = SentenceSplitter.from_defaults(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    nodes = splitter.get_nodes_from_documents(docs, show_progress=verbose)
    if verbose:
        gr.Info(f"Parsed {len(nodes)} nodes")
    chunks = {}
    for node in tqdm(nodes):
        chunks[node.node_id] = node.get_content(metadata_mode=MetadataMode.NONE)
    # Keep only the chunks that actually contain text.
    return {node_id: text for node_id, text in chunks.items() if text}
def upload_and_preview(
    files,
    chunk_size: int = 256,
    chunk_overlap: int = 0,
    split_sentences: bool = True,
):
    """Load the given files or URLs and build a dataset preview.

    Args:
        files: Either a ``pandas.DataFrame`` with a ``"urls"`` column, or an
            iterable of objects exposing a ``name`` attribute.
        chunk_size: Chunk size forwarded to ``load_corpus``.
        chunk_overlap: Chunk overlap forwarded to ``load_corpus``.
        split_sentences: Whether ``load_corpus`` should chunk the text.

    Returns:
        Tuple ``(state, preview_dataframe, message)`` where ``state`` holds
        the file paths, the dataset and the chunking parameters.

    Raises:
        gr.Error: If no usable file path or URL was supplied.
    """
    print("loading files")
    if isinstance(files, pd.DataFrame):
        # Keep only real entries: rows of the URL table can be blank or NaN,
        # and those would otherwise be handed to the loaders as paths.
        file_paths = [
            url.strip()
            for url in files["urls"].tolist()
            if isinstance(url, str) and url.strip()
        ]
    else:
        # ``files`` may be None when nothing was provided.
        file_paths = [file.name for file in (files or [])]
    if not file_paths:
        raise gr.Error("Please upload files or provide URLs first.")
    print("parsing into sentences")
    corpus = load_corpus(
        file_paths,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        split_sentences=split_sentences,
    )
    gr.Info("Creating dataset")
    # Materialize the dict views as lists so the columns are plain sequences.
    dataset = Dataset.from_dict(
        {"ids": list(corpus.keys()), "texts": list(corpus.values())}
    )
    message = f"Files uploaded and dataset preview created:\n - {len(dataset)} rows"
    state = {
        "file_paths": file_paths,
        "dataset": dataset,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return state, dataset.to_pandas(), message
def preview_dataset(
    state,
    chunk_size: int = 256,
    chunk_overlap: int = 0,
    split_sentences: bool = True,
):
    """Rebuild the dataset preview for the files already stored in ``state``.

    Args:
        state: Dict that must contain a non-empty ``"file_paths"`` entry.
        chunk_size: Chunk size forwarded to ``load_corpus``.
        chunk_overlap: Chunk overlap forwarded to ``load_corpus``.
        split_sentences: Whether ``load_corpus`` should chunk the text.

    Returns:
        Tuple ``(state, preview_dataframe, message)``; ``state`` is updated
        in place with the new dataset and chunking parameters.

    Raises:
        gr.Error: If ``state`` has no file paths yet.
    """
    file_paths = state.get("file_paths")
    if not file_paths:
        raise gr.Error("Please upload files first.")

    print("parsing into sentences")
    corpus = load_corpus(
        file_paths,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        split_sentences=split_sentences,
    )

    print("Creating dataset")
    columns = {"ids": corpus.keys(), "texts": corpus.values()}
    dataset = Dataset.from_dict(columns)
    summary = f"Dataset preview updated:\n - {len(dataset)} rows"

    # Remember the fresh dataset and the parameters that produced it.
    state.update(
        dataset=dataset,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return state, dataset.to_pandas(), summary
def upload_to_hub(
    state,
    hub_id: str = None,
    private: bool = False,
    oauth_token: gr.OAuthToken = None,
):
    """Push the previewed dataset to the Hub and update its dataset card.

    Args:
        state: Dict holding ``"dataset"``, ``"chunk_size"`` and
            ``"chunk_overlap"``.
        hub_id: Target dataset repo id. When empty, nothing is uploaded and
            only the row-count summary is returned.
        private: Passed to ``push_to_hub`` as ``private``.
        oauth_token: Token object whose ``.token`` is used for the upload.

    Returns:
        A summary message, extended with a link to the repo after an upload.

    Raises:
        gr.Error: If no dataset has been previewed, or a ``hub_id`` is given
            without an OAuth token.
    """
    dataset = state.get("dataset")
    # Compare against None rather than relying on truthiness: the dataset
    # supports ``len()``, so a zero-row dataset would be falsy and wrongly
    # reported as "not previewed".
    if dataset is None:
        raise gr.Error("Please preview the dataset first.")

    message = f"Dataset has: \n - {len(dataset)} rows"

    # Ignore surrounding whitespace typed into the Hub ID field.
    hub_id = (hub_id or "").strip()
    if not hub_id:
        return message
    if oauth_token is None:
        raise gr.Error("Please login to Hugging Face Hub to push to hub")

    gr.Info("Uploading dataset to the Hugging Face Hub...")
    dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
    update_dataset_card(
        hub_id, oauth_token.token, state["chunk_size"], state["chunk_overlap"]
    )
    message += (
        f"\n\nUploaded to [{hub_id}](https://huggingface.co/datasets/{hub_id})"
    )
    return message
def update_dataset_card(
    hub_id,
    token,
    chunk_size,
    chunk_overlap,
):
    """Add the Corpus Creator description and tag to a dataset card.

    Loads the card for ``hub_id``, fills in a template description when the
    card text is empty, ensures the ``corpus-creator`` tag is present, and
    pushes the card back to the Hub.

    Args:
        hub_id: Dataset repo id whose card is updated.
        token: Token used to load and push the card.
        chunk_size: Chunk size mentioned in the template description.
        chunk_overlap: Chunk overlap mentioned in the template description.
    """
    card = DatasetCard.load(hub_id, token=token)
    if not card.text:
        # add template description to card text
        card.text += f"""This dataset was created using [Corpus Creator](https://huggingface.co/spaces/davanstrien/corpus-creator). This dataset was created by parsing a corpus of text files into chunks of sentences using Llama Index.
This processing was done with a chunk size of {chunk_size} and a chunk overlap of {chunk_overlap}."""
    # Copy the existing tags (tolerating a missing/None value) and add ours
    # only once, so repeated uploads do not pile up duplicate tags.
    tags = list(card.data.get("tags", None) or [])
    if "corpus-creator" not in tags:
        tags.append("corpus-creator")
    card.data["tags"] = tags
    card.push_to_hub(hub_id, token=token)
# Markdown introduction for the app; rendered with `gr.Markdown(description)`
# inside the Blocks layout.
description = """Corpus Creator is a tool for transforming a collection of text files into a Hugging Face dataset, perfect for various natural language processing (NLP) tasks. Whether you're preparing data for synthetic generation, building pipelines, or setting up annotation tasks, this app simplifies the process.
Key features:
- 🗂️ Reads popular document formats (PDF, DOCX, PPTX, HTML, AsciiDoc, Markdown)
- ✂️ Customizable text chunking
- 👁️ Instant dataset preview
- 🚀 One-click upload to Hugging Face Hub
#### Powered by Llama Index and Docling
Corpus Creator leverages the power of Llama Index, a data framework for LLM-based applications. Specifically, we use Llama Index's `SentenceSplitter` class to intelligently chunk your text. This ensures that your dataset is split in a way that preserves semantic meaning, making it ideal for downstream NLP tasks. [Learn more about Llama Index](https://www.llamaindex.ai/).
Docling is a tool for converting documents to text. It supports a wide range of document formats, including PDF, DOCX, PPTX, Images, HTML, AsciiDoc, and Markdown. [Learn more about Docling](https://ds4sd.github.io/docling/).
Get started by uploading your files and see your corpus take shape!
[View an example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with Corpus Creator.
"""
# Gradio UI: layout, event wiring and launch.
# NOTE(review): the nesting below was reconstructed from a flattened source;
# confirm which container each component belongs to (in particular the
# "Upload to the Hub" button and the second tab's rows).
with gr.Blocks() as demo:
    # Per-session state shared by the callbacks (file paths, dataset,
    # chunking parameters).
    state = gr.State({})
    gr.HTML(
        """<h1 style='text-align: center;'> Corpus Creator</h1>
<center><i> 📁 From scattered files to a structured dataset in minutes 📁 </i></center>"""
    )
    gr.Markdown(description)
    gr.Markdown(
        "### Sign in to Hugging Face Hub if you want to upload the dataset to the Hub"
    )
    gr.LoginButton()
    gr.Markdown(
        "### 1. Upload Files\nClick 'Upload Files' to select text file(s). A preview will generate automatically"
    )
    # First tab: upload local files.
    with gr.Tab():
        with gr.Row():
            upload_button = gr.File(
                file_types=[
                    ".hwp",
                    ".pdf",
                    ".docx",
                    ".pptx",
                    ".ppt",
                    ".pptm",
                    ".csv",
                    ".epub",
                    ".md",
                    ".mbox",
                    ".ipynb",
                    ".xls",
                    ".xlsx",
                ],
                file_count="multiple",
                height=50,
                interactive=True,
                label="Upload Files",
            )
    # Second tab: provide URLs in a one-column table instead of files.
    with gr.Tab():
        with gr.Row():
            urls = gr.Dataframe(label="URL", headers=["urls"], interactive=True)
        with gr.Row():
            upload_button_files = gr.Button("Upload URLs")
    gr.Markdown("""
### 2. Adjust Parameters for Chunking Text (Optional)
Customize the chunk size, overlap, and sentence splitting option according to your requirements.
""")
    # Chunking controls; their values are passed to the preview callbacks.
    with gr.Row():
        split_sentences = gr.Checkbox(True, label="Split sentences?")
        chunk_size = gr.Number(
            256,
            label="Chunk size (size to split text into)",
            minimum=10,
            maximum=4096,
            step=1,
        )
        chunk_overlap = gr.Number(
            0,
            label="Chunk overlap (overlap size between chunks)",
            minimum=0,
            maximum=4096,
            step=1,
        )
    gr.Markdown(
        "### 3. Update Preview\nClick 'Update Preview' to see changes based on new parameters."
    )
    update_preview_button = gr.Button("Update Preview")
    corpus_preview_df = gr.DataFrame(label="Dataset Preview")
    preview_summary = gr.Markdown()
    gr.Markdown("""### 4. Upload to the Hub
After adjusting parameters and previewing the dataset, you can upload it to the Hugging Face Hub. Make sure you are signed in to your Hugging Face account. Specify the Hub ID and choose whether to make the dataset private. Click 'Upload to Hub' to complete the process.
""")
    with gr.Row():
        with gr.Column():
            hub_id = gr.Textbox(value=None, label="Hub ID")
            private = gr.Checkbox(False, label="Upload dataset to a private repo?")
    upload_hub_button = gr.Button("Upload to the Hub")
    upload_summary = gr.Markdown()
    # Viewer for the log file written by the logging setup.
    with gr.Accordion("detailed logs", open=False):
        Log(log_file, dark=True, xterm_font_size=12)
    # Both upload paths (files and URLs) run the same callback and refresh
    # the state, the preview table and the summary.
    upload_button.upload(
        upload_and_preview,
        inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
        outputs=[state, corpus_preview_df, preview_summary],
    )
    upload_button_files.click(
        upload_and_preview,
        inputs=[urls, chunk_size, chunk_overlap, split_sentences],
        outputs=[state, corpus_preview_df, preview_summary],
    )
    # Re-run chunking on the already-stored file paths with new parameters.
    update_preview_button.click(
        preview_dataset,
        inputs=[state, chunk_size, chunk_overlap, split_sentences],
        outputs=[state, corpus_preview_df, preview_summary],
    )
    # `upload_to_hub` also declares an `oauth_token` parameter that is not
    # listed in `inputs` here.
    upload_hub_button.click(
        upload_to_hub,
        inputs=[state, hub_id, private],
        outputs=[upload_summary],
    )
# Start the app with debug mode enabled.
demo.launch(debug=True)
|