Quentin Lhoest committed on
Commit
6356cbd
·
1 Parent(s): c264dfc

load only local datasets by default

Browse files
Files changed (1) hide show
  1. tagging_app.py +14 -10
tagging_app.py CHANGED
@@ -3,12 +3,18 @@ import datasets
3
  import json
4
  import os
5
  import streamlit as st
 
6
  import yaml
7
-
8
  from dataclasses import asdict
 
 
 
9
  from glob import glob
10
  from os.path import join as pjoin
11
 
 
 
 
12
  st.set_page_config(
13
  page_title="HF Dataset Tagging App",
14
  page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
@@ -132,7 +138,7 @@ def load_all_dataset_infos(dataset_list):
132
  def load_existing_tags():
133
  has_tags = {}
134
  for fname in glob("saved_tags/*/*/tags.json"):
135
- _, did, cid, _ = fname.split('/')
136
  has_tags[did] = has_tags.get(did, {})
137
  has_tags[did][cid] = fname
138
  return has_tags
@@ -160,15 +166,16 @@ to pre-load the tag sets from another dataset or configuration to avoid too much
160
  The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
161
  """
162
 
163
- all_dataset_ids = copy.deepcopy(get_dataset_list())
164
  existing_tag_sets = load_existing_tags()
165
- all_dataset_infos = load_all_dataset_infos(all_dataset_ids)
 
166
 
167
  st.sidebar.markdown(app_desc)
168
 
169
  # option to only select from datasets that still need to be annotated
170
  only_missing = st.sidebar.checkbox("Show only un-annotated configs")
171
 
 
172
  if only_missing:
173
  dataset_choose_list = ["local dataset"] + [did for did, c_dict in all_dataset_infos.items()
174
  if not all([cid in existing_tag_sets.get(did, {}) for cid in c_dict])]
@@ -181,9 +188,10 @@ dataset_id = st.sidebar.selectbox(
181
  index=0,
182
  )
183
 
 
184
  if dataset_id == "local dataset":
185
  path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
186
- if path_to_info not in ["/path/to/dataset/", ""]:
187
  dataset_infos = json.load(open(pjoin(path_to_info, "dataset_infos.json")))
188
  confs = dataset_infos.keys()
189
  all_info_dicts = {}
@@ -202,8 +210,6 @@ if dataset_id == "local dataset":
202
  'splits': {},
203
  }
204
  }
205
- else:
206
- all_info_dicts = all_dataset_infos[dataset_id]
207
 
208
  if only_missing:
209
  config_choose_list = [cid for cid in all_info_dicts
@@ -249,8 +255,6 @@ c2.markdown(f"### Writing tags for: {dataset_id} / {config_id}")
249
  ##########
250
  c2.markdown("#### Pre-loading an existing tag set")
251
 
252
- existing_tag_sets = load_existing_tags()
253
-
254
  pre_loaded = {
255
  "task_categories": [],
256
  "task_ids": [],
@@ -442,7 +446,7 @@ with c3.beta_expander("Show JSON output for the current config"):
442
 
443
  with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
444
  task_saved_configs = dict([
445
- (fname.split('/')[-2], json.load(open(fname)))
446
  for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
447
  ])
448
  aggregate_config = {}
 
3
  import json
4
  import os
5
  import streamlit as st
6
+ import sys
7
  import yaml
 
8
  from dataclasses import asdict
9
+ from pathlib import Path
10
+ from typing import Dict
11
+
12
  from glob import glob
13
  from os.path import join as pjoin
14
 
15
+
16
+ load_remote_datasets = "--load_remote_datasets" in sys.argv[1:]
17
+
18
  st.set_page_config(
19
  page_title="HF Dataset Tagging App",
20
  page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
 
138
  def load_existing_tags():
139
  has_tags = {}
140
  for fname in glob("saved_tags/*/*/tags.json"):
141
+ _, did, cid, _ = fname.split(os.sep)
142
  has_tags[did] = has_tags.get(did, {})
143
  has_tags[did][cid] = fname
144
  return has_tags
 
166
  The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
167
  """
168
 
 
169
  existing_tag_sets = load_existing_tags()
170
+ all_dataset_ids = list(existing_tag_sets.keys()) if not load_remote_datasets else copy.deepcopy(get_dataset_list())
171
+ all_dataset_infos = {} if not load_remote_datasets else load_all_dataset_infos(all_dataset_ids)
172
 
173
  st.sidebar.markdown(app_desc)
174
 
175
  # option to only select from datasets that still need to be annotated
176
  only_missing = st.sidebar.checkbox("Show only un-annotated configs")
177
 
178
+
179
  if only_missing:
180
  dataset_choose_list = ["local dataset"] + [did for did, c_dict in all_dataset_infos.items()
181
  if not all([cid in existing_tag_sets.get(did, {}) for cid in c_dict])]
 
188
  index=0,
189
  )
190
 
191
+ all_info_dicts = {}
192
  if dataset_id == "local dataset":
193
  path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
194
+ if path_to_info != "/path/to/dataset/":
195
  dataset_infos = json.load(open(pjoin(path_to_info, "dataset_infos.json")))
196
  confs = dataset_infos.keys()
197
  all_info_dicts = {}
 
210
  'splits': {},
211
  }
212
  }
 
 
213
 
214
  if only_missing:
215
  config_choose_list = [cid for cid in all_info_dicts
 
255
  ##########
256
  c2.markdown("#### Pre-loading an existing tag set")
257
 
 
 
258
  pre_loaded = {
259
  "task_categories": [],
260
  "task_ids": [],
 
446
 
447
  with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
448
  task_saved_configs = dict([
449
+ (Path(fname).parent.name, json.load(open(fname)))
450
  for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
451
  ])
452
  aggregate_config = {}