Spaces:
Running
Running
Quentin Lhoest
committed on
Commit
·
6356cbd
1
Parent(s):
c264dfc
load local only dataset by default
Browse files
- tagging_app.py +14 -10
tagging_app.py
CHANGED
@@ -3,12 +3,18 @@ import datasets
|
|
3 |
import json
|
4 |
import os
|
5 |
import streamlit as st
|
|
|
6 |
import yaml
|
7 |
-
|
8 |
from dataclasses import asdict
|
|
|
|
|
|
|
9 |
from glob import glob
|
10 |
from os.path import join as pjoin
|
11 |
|
|
|
|
|
|
|
12 |
st.set_page_config(
|
13 |
page_title="HF Dataset Tagging App",
|
14 |
page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
|
@@ -132,7 +138,7 @@ def load_all_dataset_infos(dataset_list):
|
|
132 |
def load_existing_tags():
|
133 |
has_tags = {}
|
134 |
for fname in glob("saved_tags/*/*/tags.json"):
|
135 |
-
_, did, cid, _ = fname.split(
|
136 |
has_tags[did] = has_tags.get(did, {})
|
137 |
has_tags[did][cid] = fname
|
138 |
return has_tags
|
@@ -160,15 +166,16 @@ to pre-load the tag sets from another dataset or configuration to avoid too much
|
|
160 |
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
161 |
"""
|
162 |
|
163 |
-
all_dataset_ids = copy.deepcopy(get_dataset_list())
|
164 |
existing_tag_sets = load_existing_tags()
|
165 |
-
|
|
|
166 |
|
167 |
st.sidebar.markdown(app_desc)
|
168 |
|
169 |
# option to only select from datasets that still need to be annotated
|
170 |
only_missing = st.sidebar.checkbox("Show only un-annotated configs")
|
171 |
|
|
|
172 |
if only_missing:
|
173 |
dataset_choose_list = ["local dataset"] + [did for did, c_dict in all_dataset_infos.items()
|
174 |
if not all([cid in existing_tag_sets.get(did, {}) for cid in c_dict])]
|
@@ -181,9 +188,10 @@ dataset_id = st.sidebar.selectbox(
|
|
181 |
index=0,
|
182 |
)
|
183 |
|
|
|
184 |
if dataset_id == "local dataset":
|
185 |
path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
|
186 |
-
if path_to_info
|
187 |
dataset_infos = json.load(open(pjoin(path_to_info, "dataset_infos.json")))
|
188 |
confs = dataset_infos.keys()
|
189 |
all_info_dicts = {}
|
@@ -202,8 +210,6 @@ if dataset_id == "local dataset":
|
|
202 |
'splits': {},
|
203 |
}
|
204 |
}
|
205 |
-
else:
|
206 |
-
all_info_dicts = all_dataset_infos[dataset_id]
|
207 |
|
208 |
if only_missing:
|
209 |
config_choose_list = [cid for cid in all_info_dicts
|
@@ -249,8 +255,6 @@ c2.markdown(f"### Writing tags for: {dataset_id} / {config_id}")
|
|
249 |
##########
|
250 |
c2.markdown("#### Pre-loading an existing tag set")
|
251 |
|
252 |
-
existing_tag_sets = load_existing_tags()
|
253 |
-
|
254 |
pre_loaded = {
|
255 |
"task_categories": [],
|
256 |
"task_ids": [],
|
@@ -442,7 +446,7 @@ with c3.beta_expander("Show JSON output for the current config"):
|
|
442 |
|
443 |
with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
|
444 |
task_saved_configs = dict([
|
445 |
-
(fname.
|
446 |
for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
|
447 |
])
|
448 |
aggregate_config = {}
|
|
|
3 |
import json
|
4 |
import os
|
5 |
import streamlit as st
|
6 |
+
import sys
|
7 |
import yaml
|
|
|
8 |
from dataclasses import asdict
|
9 |
+
from pathlib import Path
|
10 |
+
from typing import Dict
|
11 |
+
|
12 |
from glob import glob
|
13 |
from os.path import join as pjoin
|
14 |
|
15 |
+
|
16 |
+
load_remote_datasets = "--load_remote_datasets" in sys.argv[1:]
|
17 |
+
|
18 |
st.set_page_config(
|
19 |
page_title="HF Dataset Tagging App",
|
20 |
page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
|
|
|
138 |
def load_existing_tags():
|
139 |
has_tags = {}
|
140 |
for fname in glob("saved_tags/*/*/tags.json"):
|
141 |
+
_, did, cid, _ = fname.split(os.sep)
|
142 |
has_tags[did] = has_tags.get(did, {})
|
143 |
has_tags[did][cid] = fname
|
144 |
return has_tags
|
|
|
166 |
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
167 |
"""
|
168 |
|
|
|
169 |
existing_tag_sets = load_existing_tags()
|
170 |
+
all_dataset_ids = list(existing_tag_sets.keys()) if not load_remote_datasets else copy.deepcopy(get_dataset_list())
|
171 |
+
all_dataset_infos = {} if not load_remote_datasets else load_all_dataset_infos(all_dataset_ids)
|
172 |
|
173 |
st.sidebar.markdown(app_desc)
|
174 |
|
175 |
# option to only select from datasets that still need to be annotated
|
176 |
only_missing = st.sidebar.checkbox("Show only un-annotated configs")
|
177 |
|
178 |
+
|
179 |
if only_missing:
|
180 |
dataset_choose_list = ["local dataset"] + [did for did, c_dict in all_dataset_infos.items()
|
181 |
if not all([cid in existing_tag_sets.get(did, {}) for cid in c_dict])]
|
|
|
188 |
index=0,
|
189 |
)
|
190 |
|
191 |
+
all_info_dicts = {}
|
192 |
if dataset_id == "local dataset":
|
193 |
path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
|
194 |
+
if path_to_info != "/path/to/dataset/":
|
195 |
dataset_infos = json.load(open(pjoin(path_to_info, "dataset_infos.json")))
|
196 |
confs = dataset_infos.keys()
|
197 |
all_info_dicts = {}
|
|
|
210 |
'splits': {},
|
211 |
}
|
212 |
}
|
|
|
|
|
213 |
|
214 |
if only_missing:
|
215 |
config_choose_list = [cid for cid in all_info_dicts
|
|
|
255 |
##########
|
256 |
c2.markdown("#### Pre-loading an existing tag set")
|
257 |
|
|
|
|
|
258 |
pre_loaded = {
|
259 |
"task_categories": [],
|
260 |
"task_ids": [],
|
|
|
446 |
|
447 |
with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
|
448 |
task_saved_configs = dict([
|
449 |
+
(Path(fname).parent.name, json.load(open(fname)))
|
450 |
for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
|
451 |
])
|
452 |
aggregate_config = {}
|