lhoestq (HF staff) committed
Commit daa36b9 · 1 Parent(s): 8908f22

initial commit

Files changed (3):
  1. README.md +11 -3
  2. app.py +186 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
- title: Datasets Slices
- emoji: 👀
+ title: Datasets Explorer
+ emoji: 📖
 colorFrom: pink
 colorTo: purple
 sdk: gradio
@@ -9,4 +9,12 @@ app_file: app.py
 pinned: false
 ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 📖 Dataset Explorer
+
+ Access any slice of data of any dataset on the [Hugging Face Dataset Hub](https://huggingface.co/datasets)
+
+ Run:
+
+ ```
+ gradio app.py
+ ```
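The `gradio app.py` command runs the script with auto-reload, which is convenient for development. Since app.py (added below) only calls `demo.launch()` under a `__main__` guard, the Space can also be started as a plain script or programmatically; a minimal sketch, assuming app.py and its requirements are available locally:

```python
# Equivalent to `python app.py`: import the Blocks object defined in app.py and serve it.
from app import demo

demo.launch()
```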
app.py ADDED
@@ -0,0 +1,186 @@
+ import gradio as gr
+ from functools import lru_cache
+ from hffs.fs import HfFileSystem
+ from typing import List, Tuple, Callable
+ import pandas as pd
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ from functools import partial
+ from io import StringIO
+ from tqdm.contrib.concurrent import thread_map
+ from datasets import Features
+
+
+
+ class AppError(RuntimeError):
+     pass
+
+
+ PAGE_SIZE = 20
+
+
+ @lru_cache(maxsize=128)
+ def get_parquet_fs(dataset: str) -> HfFileSystem:
+     try:
+         fs = HfFileSystem(dataset, repo_type="dataset", revision="refs/convert/parquet")
+         if any(fs.isfile(path) for path in fs.ls("") if not path.startswith(".")):
+             raise AppError(f"Parquet export doesn't exist for '{dataset}'.")
+         return fs
+     except:
+         raise AppError(f"Parquet export doesn't exist for '{dataset}'.")
+
+
+ @lru_cache(maxsize=128)
+ def get_parquet_configs(dataset: str) -> List[str]:
+     fs = get_parquet_fs(dataset)
+     return [path for path in fs.ls("") if fs.isdir(path)]
+
+
+ def _sorted_split_key(split: str) -> str:
+     return split if not split.startswith("train") else chr(0) + split  # always "train" first
+
+
+ @lru_cache(maxsize=128)
+ def get_parquet_splits(dataset: str, config: str) -> List[str]:
+     fs = get_parquet_fs(dataset)
+     all_parts = [path.rsplit(".", 1)[0].split("-") for path in fs.glob(f"{config}/*.parquet")]
+     return sorted(set(parts[-4] if len(parts) > 3 and parts[-2] == "of" else parts[-1] for parts in all_parts), key=_sorted_split_key)
+
+ def sanitize_inputs(dataset: str, config: str, split: str, page: str) -> Tuple[str, str, str, int]:
+     try:
+         page = int(page)
+         assert page > 0
+     except:
+         raise AppError(f"Bad page: {page}")
+     if not dataset:
+         raise AppError("Empty dataset name")
+     if not config:
+         raise AppError(f"Empty config. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
+     if not split:
+         raise AppError(f"Empty split. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
+     return dataset, config, split, int(page)
+
+
+ RowGroupReaders = List[Callable[[], pa.Table]]
+
+
+ @lru_cache(maxsize=128)
+ def index(dataset: str, config: str, split: str) -> Tuple[np.ndarray, RowGroupReaders, int, str]:
+     fs = get_parquet_fs(dataset)
+     sources = fs.glob(f"{config}/*-{split}.parquet") + fs.glob(f"{config}/*-{split}-*-of-*.parquet")
+     if not sources:
+         if config not in get_parquet_configs(dataset):
+             raise AppError(f"Invalid config {config}. Available configs are: {', '.join(get_parquet_configs(dataset))}.")
+         else:
+             raise AppError(f"Invalid split {split}. Available splits are: {', '.join(get_parquet_splits(dataset, config))}.")
+     all_pf: List[pq.ParquetFile] = thread_map(partial(pq.ParquetFile, filesystem=fs), sources)
+     features = Features.from_arrow_schema(all_pf[0].schema.to_arrow_schema())
+     columns = [col for col in features if all(bad_type not in str(features[col]) for bad_type in ["Image(", "Audio(", "'binary'"])]
+     info = "" if len(columns) == len(features) else f"Some columns are not supported yet: {sorted(set(features) - set(columns))}"
+     rg_offsets = np.cumsum([pf.metadata.row_group(i).num_rows for pf in all_pf for i in range(pf.metadata.num_row_groups)])
+     rg_readers = [partial(pf.read_row_group, i, columns=columns) for pf in all_pf for i in range(pf.metadata.num_row_groups)]
+     max_page = rg_offsets[-1] // PAGE_SIZE
+     return rg_offsets, rg_readers, max_page, info
+
+
+ def query(page: int, page_size: int, rg_offsets: np.ndarray, rg_readers: RowGroupReaders) -> pd.DataFrame:
+     start_row, end_row = (page - 1) * page_size, page * page_size
+     start_rg, end_rg = np.searchsorted(rg_offsets, [start_row, end_row], side="right")
+     if page < 1 or end_rg >= len(rg_readers):
+         raise AppError(f"Page {page} does not exist")
+     pa_table = pa.concat_tables([rg_readers[i]() for i in range(start_rg, end_rg + 1)])
+     offset = start_row - rg_offsets[start_rg - 1] if start_rg else start_row
+     pa_table = pa_table.slice(offset, end_row - start_row)
+     return pa_table.to_pandas()
+
+
+ @lru_cache(maxsize=128)
+ def get_page(dataset: str, config: str, split: str, page: str) -> Tuple[str, int, str]:
+     dataset, config, split, page = sanitize_inputs(dataset, config, split, page)
+     rg_offsets, rg_readers, max_page, info = index(dataset, config, split)
+     df = query(page, PAGE_SIZE, rg_offsets=rg_offsets, rg_readers=rg_readers)
+     buf = StringIO()
+     df.to_json(buf, lines=True, orient="records")
+     return buf.getvalue(), max_page, info
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 📖 Dataset Explorer\n\nAccess any slice of data of any dataset on the [Hugging Face Dataset Hub](https://huggingface.co/datasets)")
+     cp_dataset = gr.Textbox("squad", label="Pick a dataset", placeholder="squad")
+     cp_go = gr.Button("Explore")
+     cp_config = gr.Dropdown(["plain_text"], value="plain_text", label="Config", visible=False)
+     cp_split = gr.Dropdown(["train", "validation"], value="train", label="Split", visible=False)
+     with gr.Row():
+         cp_page = gr.Textbox("1", label="Page", placeholder="1", visible=False)
+         cp_goto_page = gr.Button("Go to page", visible=False)
+     cp_error = gr.Markdown("", visible=False)
+     cp_info = gr.Markdown("", visible=False)
+     cp_result = gr.Markdown("", visible=False)
+
+     def show_error(message: str) -> dict:
+         return {
+             cp_error: gr.update(visible=True, value=f"## ❌ Error:\n\n{message}"),
+             cp_info: gr.update(visible=False, value=""),
+             cp_result: gr.update(visible=False, value=""),
+         }
+
+     def show_dataset_at_config_and_split_and_page(dataset: str, config: str, split: str, page: str) -> dict:
+         try:
+             jsonl_result, max_page, info = get_page(dataset, config, split, page)
+             info = f"({info})" if info else ""
+             return {
+                 cp_result: gr.update(visible=True, value=f"```json\n{jsonl_result}\n```"),
+                 cp_info: gr.update(visible=True, value=f"Page {page}/{max_page} {info}"),
+                 cp_error: gr.update(visible=False, value="")
+             }
+         except AppError as err:
+             return show_error(str(err))
+
+     def show_dataset_at_config_and_split(dataset: str, config: str, split: str) -> dict:
+         try:
+             return {
+                 **show_dataset_at_config_and_split_and_page(dataset, config, split, "1"),
+                 cp_page: gr.update(value="1", visible=True),
+                 cp_goto_page: gr.update(visible=True),
+             }
+         except AppError as err:
+             return show_error(str(err))
+
+     def show_dataset_at_config(dataset: str, config: str) -> dict:
+         try:
+             splits = get_parquet_splits(dataset, config)
+             if not splits:
+                 raise AppError(f"Dataset {dataset} with config {config} has no splits.")
+             else:
+                 split = splits[0]
+             return {
+                 **show_dataset_at_config_and_split(dataset, config, split),
+                 cp_split: gr.update(value=split, choices=splits, visible=len(splits) > 1),
+             }
+         except AppError as err:
+             return show_error(str(err))
+
+     def show_dataset(dataset: str) -> dict:
+         try:
+             configs = get_parquet_configs(dataset)
+             if not configs:
+                 raise AppError(f"Dataset {dataset} has no configs.")
+             else:
+                 config = configs[0]
+             return {
+                 **show_dataset_at_config(dataset, config),
+                 cp_config: gr.update(value=config, choices=configs, visible=len(configs) > 1),
+             }
+         except AppError as err:
+             return show_error(str(err))
+
+     all_outputs = [cp_config, cp_split, cp_page, cp_goto_page, cp_result, cp_info, cp_error]
+     cp_go.click(show_dataset, inputs=[cp_dataset], outputs=all_outputs)
+     cp_config.change(show_dataset_at_config, inputs=[cp_dataset, cp_config], outputs=all_outputs)
+     cp_split.change(show_dataset_at_config_and_split, inputs=[cp_dataset, cp_config, cp_split], outputs=all_outputs)
+     cp_goto_page.click(show_dataset_at_config_and_split_and_page, inputs=[cp_dataset, cp_config, cp_split, cp_page], outputs=all_outputs)
+
+
+ if __name__ == "__main__":
+     demo.launch()
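The pagination above hinges on two lines: `index()` turns the row-group sizes of every Parquet shard into cumulative offsets with `np.cumsum`, and `query()` uses `np.searchsorted` on those offsets to find the smallest run of row groups covering the requested page, so only those groups need to be read before slicing to exactly `PAGE_SIZE` rows. A minimal sketch of the same arithmetic with made-up row-group sizes (not part of the commit):

```python
import numpy as np

PAGE_SIZE = 20
row_group_sizes = [45, 45, 45]            # hypothetical: three row groups of 45 rows each
rg_offsets = np.cumsum(row_group_sizes)   # [45, 90, 135] = cumulative end offsets

page = 3                                  # rows 40..59 with 20 rows per page
start_row, end_row = (page - 1) * PAGE_SIZE, page * PAGE_SIZE
start_rg, end_rg = np.searchsorted(rg_offsets, [start_row, end_row], side="right")
print(start_rg, end_rg)                   # 0 1 -> only row groups 0 and 1 are read
offset = start_row - rg_offsets[start_rg - 1] if start_rg else start_row
print(offset)                             # 40 -> slice 20 rows starting here
```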
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ pyarrow
+ fsspec[http]
+ tqdm
+ datasets
+ git+https://github.com/huggingface/hffs.git
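With these dependencies installed, the paging helpers in app.py can also be used outside the Gradio UI; a rough sketch, assuming the packages above are available and the dataset has a Parquet export on the Hub:

```python
# Fetch one page of a dataset without launching the interface,
# using get_page() as defined in app.py above.
from app import get_page

jsonl_page, max_page, info = get_page("squad", "plain_text", "train", "1")
print(max_page, info)              # last page number and a note about unsupported columns, if any
print(jsonl_page.splitlines()[0])  # first row of the page as a JSON record
```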