initial commit

Files changed:
- README.md (+11, -3)
- app.py (+186, -0)
- requirements.txt (+5, -0)
README.md (CHANGED)

@@ -1,6 +1,6 @@
 ---
-title: Datasets
-emoji:
+title: Datasets Explorer
+emoji: π
 colorFrom: pink
 colorTo: purple
 sdk: gradio
@@ -9,4 +9,12 @@ app_file: app.py
 pinned: false
 ---
 
-
+# π Dataset Explorer
+
+Access any slice of data of any dataset on the [Hugging Face Dataset Hub](https://huggingface.co/datasets)
+
+Run:
+
+```python
+gradio app.py
+```

app.py (ADDED)
@@ -0,0 +1,186 @@
import gradio as gr
from functools import lru_cache
from hffs.fs import HfFileSystem
from typing import List, Tuple, Callable
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from functools import partial
from io import StringIO
from tqdm.contrib.concurrent import thread_map
from datasets import Features



class AppError(RuntimeError):
    pass


PAGE_SIZE = 20


@lru_cache(maxsize=128)
def get_parquet_fs(dataset: str) -> HfFileSystem:
    try:
        fs = HfFileSystem(dataset, repo_type="dataset", revision="refs/convert/parquet")
        if any(fs.isfile(path) for path in fs.ls("") if not path.startswith(".")):
            raise AppError(f"Parquet export doesn't exist for '{dataset}'.")
        return fs
    except:
        raise AppError(f"Parquet export doesn't exist for '{dataset}'.")


@lru_cache(maxsize=128)
def get_parquet_configs(dataset: str) -> List[str]:
    fs = get_parquet_fs(dataset)
    return [path for path in fs.ls("") if fs.isdir(path)]


def _sorted_split_key(split: str) -> str:
    return split if not split.startswith("train") else chr(0) + split  # always "train" first


@lru_cache(maxsize=128)
def get_parquet_splits(dataset: str, config: str) -> List[str]:
    fs = get_parquet_fs(dataset)
    all_parts = [path.rsplit(".", 1)[0].split("-") for path in fs.glob(f"{config}/*.parquet")]
    return sorted(set(parts[-4] if len(parts) > 3 and parts[-2] == "of" else parts[-1] for parts in all_parts), key=_sorted_split_key)

def sanitize_inputs(dataset: str, config: str, split: str, page: str) -> Tuple[str, str, str, int]:
    try:
        page = int(page)
        assert page > 0
    except:
        raise AppError(f"Bad page: {page}")
    if not dataset:
        raise AppError("Empty dataset name")
    if not config:
        raise AppError(f"Empty config. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
    if not split:
        raise AppError(f"Empty split. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
    return dataset, config, split, int(page)


RowGroupReaders = List[Callable[[], pa.Table]]


@lru_cache(maxsize=128)
def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupReaders, int, str]:
    fs = get_parquet_fs(dataset)
    sources = fs.glob(f"{config}/*-{split}.parquet") + fs.glob(f"{config}/*-{split}-*-of-*.parquet")
    if not sources:
        if config not in get_parquet_configs(dataset):
            raise AppError(f"Invalid config {config}. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
        else:
            raise AppError(f"Invalid split {split}. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
    all_pf: List[pq.ParquetFile] = thread_map(partial(pq.ParquetFile, filesystem=fs), sources)
    features = Features.from_arrow_schema(all_pf[0].schema.to_arrow_schema())
    columns = [col for col in features if all(bad_type not in str(features[col]) for bad_type in ["Image(", "Audio(", "'binary'"])]
    info = "" if len(columns) == len(features) else f"Some columns are not supported yet: {sorted(set(features) - set(columns))}"
    rg_offsets = np.cumsum([pf.metadata.row_group(i).num_rows for pf in all_pf for i in range(pf.metadata.num_row_groups)])
    rg_readers = [partial(pf.read_row_group, i, columns=columns) for pf in all_pf for i in range(pf.metadata.num_row_groups)]
    max_page = rg_offsets[-1] // PAGE_SIZE
    return rg_offsets, rg_readers, max_page, info


def query(page: int, page_size: int, rg_offsets: np.ndarray, rg_readers: RowGroupReaders) -> pd.DataFrame:
    start_row, end_row = (page - 1) * page_size, page * page_size
    start_rg, end_rg = np.searchsorted(rg_offsets, [start_row, end_row], side="right")
    if page < 1 or end_rg >= len(rg_readers):
        raise AppError(f"Page {page} does not exist")
    pa_table = pa.concat_tables([rg_readers[i]() for i in range(start_rg, end_rg + 1)])
    offset = start_row - rg_offsets[start_rg - 1] if start_rg else start_row
    pa_table = pa_table.slice(offset, end_row - start_row)
    return pa_table.to_pandas()


@lru_cache(maxsize=128)
def get_page(dataset: str, config: str, split: str, page: str) -> Tuple[str, int, str]:
    dataset, config, split, page = sanitize_inputs(dataset, config, split, page)
    rg_offsets, rg_readers, max_page, info = index(dataset, config, split)
    df = query(page, PAGE_SIZE, rg_offsets=rg_offsets, rg_readers=rg_readers)
    buf = StringIO()
    df.to_json(buf, lines=True, orient="records")
    return buf.getvalue(), max_page, info


with gr.Blocks() as demo:
    gr.Markdown("# π Dataset Explorer\n\nAccess any slice of data of any dataset on the [Hugging Face Dataset Hub](https://huggingface.co/datasets)")
    cp_dataset = gr.Textbox("squad", label="Pick a dataset", placeholder="squad")
    cp_go = gr.Button("Explore")
    cp_config = gr.Dropdown(["plain_text"], value="plain_text", label="Config", visible=False)
    cp_split = gr.Dropdown(["train", "validation"], value="train", label="Split", visible=False)
    with gr.Row():
        cp_page = gr.Textbox("1", label="Page", placeholder="1", visible=False)
        cp_goto_page = gr.Button("Go to page", visible=False)
    cp_error = gr.Markdown("", visible=False)
    cp_info = gr.Markdown("", visible=False)
    cp_result = gr.Markdown("", visible=False)

    def show_error(message: str) -> dict:
        return {
            cp_error: gr.update(visible=True, value=f"## β Error:\n\n{message}"),
            cp_info: gr.update(visible=False, value=""),
            cp_result: gr.update(visible=False, value=""),
        }

    def show_dataset_at_config_and_split_and_page(dataset: str, config: str, split: str, page: str) -> dict:
        try:
            jsonl_result, max_page, info = get_page(dataset, config, split, page)
            info = f"({info})" if info else ""
            return {
                cp_result: gr.update(visible=True, value=f"```json\n{jsonl_result}\n```"),
                cp_info: gr.update(visible=True, value=f"Page {page}/{max_page} {info}"),
                cp_error: gr.update(visible=False, value="")
            }
        except AppError as err:
            return show_error(str(err))

    def show_dataset_at_config_and_split(dataset: str, config: str, split: str) -> dict:
        try:
            return {
                **show_dataset_at_config_and_split_and_page(dataset, config, split, "1"),
                cp_page: gr.update(value="1", visible=True),
                cp_goto_page: gr.update(visible=True),
            }
        except AppError as err:
            return show_error(str(err))

    def show_dataset_at_config(dataset: str, config: str) -> dict:
        try:
            splits = get_parquet_splits(dataset, config)
            if not splits:
                raise AppError(f"Dataset {dataset} with config {config} has no splits.")
            else:
                split = splits[0]
                return {
                    **show_dataset_at_config_and_split(dataset, config, split),
                    cp_split: gr.update(value=split, choices=splits, visible=len(splits) > 1),
                }
        except AppError as err:
            return show_error(str(err))

    def show_dataset(dataset: str) -> dict:
        try:
            configs = get_parquet_configs(dataset)
            if not configs:
                raise AppError(f"Dataset {dataset} has no configs.")
            else:
                config = configs[0]
                return {
                    **show_dataset_at_config(dataset, config),
                    cp_config: gr.update(value=config, choices=configs, visible=len(configs) > 1),
                }
        except AppError as err:
            return show_error(str(err))

    all_outputs = [cp_config, cp_split, cp_page, cp_goto_page, cp_result, cp_info, cp_error]
    cp_go.click(show_dataset, inputs=[cp_dataset], outputs=all_outputs)
    cp_config.change(show_dataset_at_config, inputs=[cp_dataset, cp_config], outputs=all_outputs)
    cp_split.change(show_dataset_at_config_and_split, inputs=[cp_dataset, cp_config, cp_split], outputs=all_outputs)
    cp_goto_page.click(show_dataset_at_config_and_split_and_page, inputs=[cp_dataset, cp_config, cp_split, cp_page], outputs=all_outputs)


if __name__ == "__main__":
    demo.launch()
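The heart of app.py is the `index()`/`query()` pair: `index()` turns the Parquet files of a split into cumulative row counts (`rg_offsets`) plus one lazy reader per row group, and `query()` uses `np.searchsorted` on those offsets to fetch only the row groups that cover the requested page. Below is a minimal, self-contained sketch of that arithmetic (not part of the commit), using made-up row-group sizes instead of real Parquet metadata:

```python
# Sketch of the row-group pagination used by index()/query(), with
# hypothetical row-group sizes instead of real Parquet metadata.
import numpy as np

PAGE_SIZE = 20
row_group_sizes = [50, 30, 100]          # hypothetical num_rows of each row group
rg_offsets = np.cumsum(row_group_sizes)  # array([ 50,  80, 180])

page = 3                                 # rows 40..59 (pages are 1-indexed)
start_row, end_row = (page - 1) * PAGE_SIZE, page * PAGE_SIZE
start_rg, end_rg = np.searchsorted(rg_offsets, [start_row, end_row], side="right")
# start_rg == 0, end_rg == 1: the page spans the first two row groups, so only
# those two would be downloaded and decoded; everything after them is skipped.
offset = start_row - rg_offsets[start_rg - 1] if start_rg else start_row
print(start_rg, end_rg, offset)          # 0 1 40
```

End to end, the Space serves a page by calling `get_page(dataset, config, split, page)`, which returns the rows as JSON Lines plus the maximum page number; presumably the same function could be called from a plain Python session, provided hffs and network access to the Hub are available.
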
requirements.txt (ADDED)
@@ -0,0 +1,5 @@
pyarrow
fsspec[http]
tqdm
datasets
git+https://github.com/huggingface/hffs.git
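Note that `gradio`, `pandas`, and `numpy` are not pinned here: on Hugging Face Spaces the `sdk: gradio` field in README.md provides Gradio, and pandas/numpy are typically pulled in as transitive dependencies of pyarrow and datasets. For a local run you would likely want to install Gradio explicitly as well, e.g. `pip install gradio -r requirements.txt` (an assumption about local setup, not something stated in the commit).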