theo committed on
Commit
08a65ff
·
1 Parent(s): 927d443

black + simplification

Browse files
Files changed (1) hide show
  1. tagging_app.py +165 -381
tagging_app.py CHANGED
@@ -1,19 +1,11 @@
1
- import copy
2
- import datasets
3
  import json
4
  import os
5
- import streamlit as st
6
- import sys
7
- import yaml
8
  from dataclasses import asdict
9
- from pathlib import Path
10
- from typing import Dict
11
-
12
  from glob import glob
13
- from os.path import join as pjoin
14
 
15
-
16
- load_remote_datasets = "--load_remote_datasets" in sys.argv[1:]
 
17
 
18
  st.set_page_config(
19
  page_title="HF Dataset Tagging App",
@@ -56,110 +48,6 @@ creator_set = {
56
  ## Helper functions
57
  ########################
58
 
59
- @st.cache
60
- def filter_features(features, name="", is_sequence=False):
61
- if isinstance(features, list):
62
- return filter_features(features[0], name, is_sequence=True)
63
- if not isinstance(features, dict):
64
- return {}, []
65
- if features.get("_type", None) == 'Sequence':
66
- if "dtype" in features["feature"] or ("_type" in features["feature"] and features["feature"]["_type"] == "ClassLabel"):
67
- pre_filtered, desc = filter_features(features["feature"], name, is_sequence=True)
68
- filtered = {
69
- "feature_type": features["_type"],
70
- "feature": pre_filtered,
71
- }
72
- return filtered, desc
73
- else:
74
- filtered = {"feature_type": features["_type"]}
75
- if is_sequence:
76
- desc = [f"- `{name}`: a `list` of dictionary features containing:"]
77
- else:
78
- desc = [f"- `{name}`: a dictionary feature containing:"]
79
- for k, v in features["feature"].items():
80
- pre_filtered, pre_desc = filter_features(v, name=k)
81
- filtered[k] = pre_filtered
82
- desc += [" " + d for d in pre_desc]
83
- return filtered, desc
84
- elif features.get("_type", None) == 'Value':
85
- filtered = {
86
- "feature_type": features["_type"],
87
- "dtype": features["dtype"],
88
- }
89
- if is_sequence:
90
- desc = f"- `{name}`: a `list` of `{features['dtype']}` features."
91
- else:
92
- desc = f"- `{name}`: a `{features['dtype']}` feature."
93
- return filtered, [desc]
94
- elif features.get("_type", None) == 'ClassLabel':
95
- filtered = {
96
- "feature_type": features["_type"],
97
- "dtype": "int32",
98
- "class_names": features["names"],
99
- }
100
- if is_sequence:
101
- desc = f"- `{name}`: a `list` of classification labels, with possible values including {', '.join(['`'+nm+'`' for nm in features['names'][:5]])}."
102
- else:
103
- desc = f"- `{name}`: a classification label, with possible values including {', '.join(['`'+nm+'`' for nm in features['names'][:5]])}."
104
- return filtered, [desc]
105
- elif features.get("_type", None) in ['Translation', 'TranslationVariableLanguages']:
106
- filtered = {
107
- "feature_type": features["_type"],
108
- "dtype": "string",
109
- "languages": features["languages"],
110
- }
111
- if is_sequence:
112
- desc = f"- `{name}`: a `list` of multilingual `string` variables, with possible languages including {', '.join(['`'+nm+'`' for nm in features['languages'][:5]])}."
113
- else:
114
- desc = f"- `{name}`: a multilingual `string` variable, with possible languages including {', '.join(['`'+nm+'`' for nm in features['languages'][:5]])}."
115
- return filtered, [desc]
116
- else:
117
- filtered = {}
118
- desc = []
119
- for k, v in features.items():
120
- pre_filtered, pre_desc = filter_features(v, name=k)
121
- filtered[k] = pre_filtered
122
- desc += pre_desc
123
- return filtered, desc
124
-
125
- @st.cache
126
- def find_languages(feature_dict):
127
- if type(feature_dict) in [dict, datasets.features.Features]:
128
- languages = [l for l in feature_dict.get('languages', [])]
129
- for k, v in feature_dict.items():
130
- languages += [l for l in find_languages(v)]
131
- return languages
132
- else:
133
- return []
134
-
135
- keep_keys = ['description', 'features', 'homepage', 'license', 'splits']
136
-
137
- @st.cache(show_spinner=False)
138
- def get_info_dicts(dataset_id):
139
- module_path = datasets.load.prepare_module(dataset_id, dataset=True)
140
- builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
141
- build_confs = builder_cls.BUILDER_CONFIGS
142
- confs = [conf.name for conf in build_confs] if len(build_confs) > 0 else ['default']
143
- all_info_dicts = {}
144
- for conf in confs:
145
- builder = builder_cls(name=conf)
146
- conf_info_dict = dict([(k, v) for k, v in asdict(builder.info).items() if k in keep_keys])
147
- all_info_dicts[conf] = conf_info_dict
148
- return all_info_dicts
149
-
150
- @st.cache
151
- def get_dataset_list():
152
- return datasets.list_datasets()
153
-
154
- @st.cache(show_spinner=False)
155
- def load_all_dataset_infos(dataset_list):
156
- dataset_infos = {}
157
- for did in dataset_list:
158
- try:
159
- dataset_infos[did] = get_info_dicts(did)
160
- except:
161
- print("+++++++++++ MISSED", did)
162
- return dataset_infos
163
 
164
  def load_existing_tags():
165
  has_tags = {}
@@ -169,20 +57,35 @@ def load_existing_tags():
169
  has_tags[did][cid] = fname
170
  return has_tags
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  ########################
173
  ## Dataset selection
174
  ########################
175
 
176
- st.sidebar.markdown(
177
- """<center>
178
- <a href="https://github.com/huggingface/datasets">
179
- <img src="https://raw.githubusercontent.com/huggingface/datasets/master/docs/source/imgs/datasets_logo_name.jpg" width="200"></a>
180
- </center>""",
181
- unsafe_allow_html=True,
182
- )
183
 
184
- app_desc = """
185
- ### Dataset Tagger
 
186
 
187
  This app aims to make it easier to add structured tags to the datasets present in the library.
188
 
@@ -190,239 +93,158 @@ Each configuration requires its own tasks, as these often correspond to distinct
190
  to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
191
 
192
  The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
193
- """
194
-
195
- existing_tag_sets = load_existing_tags()
196
- all_dataset_ids = list(existing_tag_sets.keys()) if not load_remote_datasets else copy.deepcopy(get_dataset_list())
197
- all_dataset_infos = {} if not load_remote_datasets else load_all_dataset_infos(all_dataset_ids)
198
-
199
- st.sidebar.markdown(app_desc)
200
-
201
- # option to only select from datasets that still need to be annotated
202
- all_info_dicts = {}
203
- path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
204
- if path_to_info not in ["/path/to/dataset/", ""]:
205
- dataset_infos = json.load(open(pjoin(path_to_info, "dataset_infos.json")))
206
- confs = dataset_infos.keys()
207
- all_info_dicts = {}
208
- for conf, info in dataset_infos.items():
209
- conf_info_dict = dict([(k, info[k]) for k in keep_keys])
210
- all_info_dicts[conf] = conf_info_dict
211
- dataset_id = list(dataset_infos.values())[0]["builder_name"]
212
- else:
213
- dataset_id = "tmp_dir"
214
- all_info_dicts = {
215
- "default":{
216
- 'description': "",
217
- 'features': {},
218
- 'homepage': "",
219
- 'license': "",
220
- 'splits': {},
221
- }
222
- }
223
 
 
224
 
225
- config_choose_list = list(all_info_dicts.keys())
226
-
227
- config_id = st.sidebar.selectbox(
228
- label="Choose configuration",
229
- options=config_choose_list,
230
  )
231
 
232
- config_infos = all_info_dicts[config_id]
233
 
234
- c1, _, c2, _, c3 = st.beta_columns([8, 1, 12, 1, 12])
235
-
236
- ########################
237
- ## Dataset description
238
- ########################
239
-
240
- data_desc = f"### Dataset: {dataset_id} | Configuration: {config_id}" + "\n"
241
- data_desc += f"[Homepage]({config_infos['homepage']})"
242
- c1.markdown(data_desc)
 
 
 
 
 
 
 
 
 
243
 
244
- with c1.beta_expander("Dataset description:", expanded=True):
245
- st.markdown(config_infos['description'])
 
 
 
 
246
 
247
- # "pretty-fy" the features to be a little easier to read
248
- features, feature_descs = filter_features(config_infos['features'])
249
- with c1.beta_expander(f"Dataset features for config: {config_id}", expanded=False):
250
- st.write(features)
251
 
252
- ########################
253
- ## Dataset tagging
254
- ########################
255
 
256
- c2.markdown(f"### Writing tags for: {dataset_id} / {config_id}")
257
-
258
- ##########
259
- # Pre-load information to speed things up
260
- ##########
261
- c2.markdown("#### Pre-loading an existing tag set")
262
-
263
- pre_loaded = {
264
- "task_categories": [],
265
- "task_ids": [],
266
- "multilinguality": [],
267
- "languages": [],
268
- "language_creators": [],
269
- "annotations_creators": [],
270
- "source_datasets": [],
271
- "size_categories": [],
272
- "licenses": [],
273
- }
274
 
275
- if existing_tag_sets.get(dataset_id, {}).get(config_id, None) is not None:
276
- existing_tags_fname = existing_tag_sets[dataset_id][config_id]
277
- c2.markdown(f"#### Attention: this config already has a tagset saved in {existing_tags_fname}\n--- \n")
278
- if c2.checkbox("pre-load existing tag set"):
279
- pre_loaded = json.load(open(existing_tags_fname))
280
-
281
- c2.markdown("> *You may choose to pre-load the tag set of another dataset or configuration:*")
282
-
283
- with c2.beta_expander("- Choose tag set to pre-load"):
284
- did_choice_list = list(existing_tag_sets.keys())
285
- if len(existing_tag_sets) > 0:
286
- did = st.selectbox(
287
- label="Choose dataset to load tag set from",
288
- options=did_choice_list,
289
- index=did_choice_list.index(dataset_id) if dataset_id in did_choice_list else 0,
290
- )
291
- cid = st.selectbox(
292
- label="Choose config to load tag set from",
293
- options=list(existing_tag_sets[did].keys()),
294
- index=0,
295
- )
296
- if st.checkbox("pre-load this tag set"):
297
- pre_loaded = json.load(open(existing_tag_sets[did][cid]))
298
- else:
299
- st.write("There are currently no other saved tag sets.")
300
-
301
- pre_loaded["languages"] = list(set(pre_loaded["languages"] + find_languages(features)))
302
- if config_infos["license"] in license_set:
303
- pre_loaded["licenses"] = list(set(pre_loaded["licenses"] + [config_infos["license"]]))
304
-
305
- ##########
306
- # Modify or add new tags
307
- ##########
308
- c2.markdown("#### Editing the tag set")
309
- c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
310
-
311
- with c2.beta_expander("- Supported tasks", expanded=True):
312
- task_categories = st.multiselect(
313
- "What categories of task does the dataset support?",
314
- options=list(task_set.keys()),
315
- default=pre_loaded["task_categories"],
316
- format_func=lambda tg: f"{tg} : {task_set[tg]['description']}",
317
- )
318
- task_specifics = []
319
- for tg in task_categories:
320
- task_specs = st.multiselect(
321
- f"What specific *{tg}* tasks does the dataset support?",
322
- options=task_set[tg]["options"],
323
- default=[ts for ts in pre_loaded["task_ids"] if ts in task_set[tg]["options"]],
324
- )
325
- if "other" in task_specs:
326
- other_task = st.text_input(
327
- "You selected 'other' task. Please enter a short hyphen-separated description for the task:",
328
- value='my-task-description',
329
- )
330
- st.write(f"Registering {tg}-other-{other_task} task")
331
- task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
332
- task_specifics += task_specs
333
-
334
- with c2.beta_expander("- Languages", expanded=True):
335
- multilinguality = st.multiselect(
336
- "Does the dataset contain more than one language?",
337
- options=list(multilinguality_set.keys()),
338
- default=pre_loaded["multilinguality"],
339
- format_func= lambda m: f"{m} : {multilinguality_set[m]}",
340
  )
341
- if "other" in multilinguality:
342
- other_multilinguality = st.text_input(
343
- "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
344
- value='my-multilinguality',
345
  )
346
- st.write(f"Registering other-{other_multilinguality} multilinguality")
347
- multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
348
- languages = st.multiselect(
349
- "What languages are represented in the dataset?",
350
- options=list(language_set.keys()),
351
- default=pre_loaded["languages"],
352
- format_func= lambda m: f"{m} : {language_set[m]}",
 
 
 
 
 
 
 
 
353
  )
 
 
 
 
 
 
 
 
354
 
355
- with c2.beta_expander("- Dataset creators", expanded=True):
356
- language_creators = st.multiselect(
357
- "Where does the text in the dataset come from?",
358
- options=creator_set["language"],
359
- default=pre_loaded["language_creators"],
360
- )
361
- annotations_creators = st.multiselect(
362
- "Where do the annotations in the dataset come from?",
363
- options=creator_set["annotations"],
364
- default=pre_loaded["annotations_creators"],
365
- )
366
- licenses = st.multiselect(
367
- "What licenses is the dataset under?",
368
- options=list(license_set.keys()),
369
- default=pre_loaded["licenses"],
370
- format_func= lambda l: f"{l} : {license_set[l]}",
 
 
 
 
 
371
  )
372
- if "other" in licenses:
373
- other_license = st.text_input(
374
- "You selected 'other' type of license. Please enter a short hyphen-separated description:",
375
- value='my-license',
376
- )
377
- st.write(f"Registering other-{other_license} license")
378
- licenses[licenses.index("other")] = f"other-{other_license}"
379
- # link to supported datasets
380
- pre_select_ext_a = []
381
- if "original" in pre_loaded["source_datasets"]:
382
- pre_select_ext_a += ["original"]
383
- if any([p.startswith("extended") for p in pre_loaded["source_datasets"]]):
384
- pre_select_ext_a += ["extended"]
385
- extended = st.multiselect(
386
- "Does the dataset contain original data and/or was it extended from other datasets?",
387
- options=["original", "extended"],
388
- default=pre_select_ext_a,
 
 
 
389
  )
390
- source_datasets = ["original"] if "original" in extended else []
391
- if "extended" in extended:
392
- pre_select_ext_b = [p.split('|')[1] for p in pre_loaded["source_datasets"] if p.startswith("extended")]
393
- extended_sources = st.multiselect(
394
- "Which other datasets does this one use data from?",
395
- options=all_dataset_ids + ["other"],
396
- default=pre_select_ext_b,
397
  )
398
- if "other" in extended_sources:
399
- other_extended_sources = st.text_input(
400
- "You selected 'other' dataset. Please enter a short hyphen-separated description:",
401
- value='my-dataset',
402
- )
403
- st.write(f"Registering other-{other_extended_sources} dataset")
404
- extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
405
- source_datasets += [f"extended|{src}" for src in extended_sources]
406
-
407
- num_examples = (
408
- sum([dct.get('num_examples', 0) for spl, dct in config_infos['splits'].items()])
409
- if config_infos.get('splits', None) is not None
410
- else -1
411
  )
412
- if num_examples < 0:
413
- size_cat = "unknown"
414
- elif num_examples < 1000:
415
- size_cat = "n<1K"
416
- elif num_examples < 10000:
417
- size_cat = "1K<n<10K"
418
- elif num_examples < 100000:
419
- size_cat = "10K<n<100K"
420
- elif num_examples < 1000000:
421
- size_cat = "100K<n<1M"
422
- else:
423
- size_cat = "n>1M"
424
 
425
- res = {
 
 
 
 
 
 
 
 
426
  "task_categories": task_categories,
427
  "task_ids": task_specifics,
428
  "multilinguality": multilinguality,
@@ -430,47 +252,9 @@ res = {
430
  "language_creators": language_creators,
431
  "annotations_creators": annotations_creators,
432
  "source_datasets": source_datasets,
433
- "size_categories": [size_cat],
434
  "licenses": licenses,
435
- }
436
-
437
- ########################
438
- ## Show results
439
- ########################
440
- c3.markdown("### Finalized tag set:")
441
-
442
- if c3.button("Done? Save to File!"):
443
- if not os.path.isdir(pjoin('saved_tags', dataset_id)):
444
- _ = os.mkdir(pjoin('saved_tags', dataset_id))
445
- if not os.path.isdir(pjoin('saved_tags', dataset_id, config_id)):
446
- _ = os.mkdir(pjoin('saved_tags', dataset_id, config_id))
447
- json.dump(res, open(pjoin('saved_tags', dataset_id, config_id, 'tags.json'), 'w'))
448
-
449
- with c3.beta_expander("Show YAML output aggregating the tags saved for all configs", expanded=False):
450
- task_saved_configs = dict([
451
- (Path(fname).parent.name, json.load(open(fname)))
452
- for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
453
- ])
454
- aggregate_config = {}
455
- for conf_name, saved_tags in task_saved_configs.items():
456
- for tag_k, tag_ls in saved_tags.items():
457
- aggregate_config[tag_k] = aggregate_config.get(tag_k, {})
458
- aggregate_config[tag_k][conf_name] = tuple(sorted(tag_ls))
459
- for tag_k in aggregate_config:
460
- if len(set(aggregate_config[tag_k].values())) == 1:
461
- aggregate_config[tag_k] = list(list(set(aggregate_config[tag_k].values()))[0])
462
- else:
463
- for conf_name in aggregate_config[tag_k]:
464
- aggregate_config[tag_k][conf_name] = list(aggregate_config[tag_k][conf_name])
465
- st.text('---\n' + yaml.dump(aggregate_config) + '---')
466
-
467
- with c3.beta_expander(f"Show Markdown Data Fields for config: {config_id}", expanded=True):
468
- st.text('\n'.join(feature_descs))
469
-
470
- with c3.beta_expander("Show JSON output for the current config"):
471
- st.write(res)
472
-
473
- c3.markdown("--- ")
474
-
475
- with c3.beta_expander("----> show full task set <----", expanded=True):
476
- st.write(task_set)
 
 
 
1
  import json
2
  import os
 
 
 
3
  from dataclasses import asdict
 
 
 
4
  from glob import glob
 
5
 
6
+ import datasets
7
+ import streamlit as st
8
+ import yaml
9
 
10
  st.set_page_config(
11
  page_title="HF Dataset Tagging App",
 
48
  ## Helper functions
49
  ########################
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  def load_existing_tags():
53
  has_tags = {}
 
57
  has_tags[did][cid] = fname
58
  return has_tags
59
 
60
+
61
+ def new_pre_loaded():
62
+ return {
63
+ "task_categories": [],
64
+ "task_ids": [],
65
+ "multilinguality": [],
66
+ "languages": [],
67
+ "language_creators": [],
68
+ "annotations_creators": [],
69
+ "source_datasets": [],
70
+ "size_categories": [],
71
+ "licenses": [],
72
+ }
73
+
74
+
75
+ pre_loaded = new_pre_loaded()
76
+
77
+ existing_tag_sets = load_existing_tags()
78
+ all_dataset_ids = list(existing_tag_sets.keys())
79
+
80
+
81
  ########################
82
  ## Dataset selection
83
  ########################
84
 
 
 
 
 
 
 
 
85
 
86
+ st.sidebar.markdown(
87
+ """
88
+ # HuggingFace Dataset Tagger
89
 
90
  This app aims to make it easier to add structured tags to the datasets present in the library.
91
 
 
93
  to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
94
 
95
  The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
+ ### Preloading an existing tag set
98
 
99
+ You can load an existing tag set to get started if you want.
100
+ Beware that clicking pre-load will overwrite the current state!
101
+ """
 
 
102
  )
103
 
 
104
 
105
+ qp = st.experimental_get_query_params()
106
+ preload = qp.get("preload_dataset", list())
107
+ did_index = 2
108
+ if len(preload) == 1 and preload[0] in all_dataset_ids:
109
+ did_qp, *_ = preload
110
+ cid_qp = next(iter(existing_tag_sets[did_qp]))
111
+ pre_loaded = json.load(open(existing_tag_sets[did_qp][cid_qp]))
112
+ did_index = all_dataset_ids.index(did_qp)
113
+
114
+ did = st.sidebar.selectbox(label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index)
115
+ if len(existing_tag_sets[did]) > 1:
116
+ cid = st.sidebar.selectbox(
117
+ label="Choose config to load tag set from",
118
+ options=list(existing_tag_sets[did].keys()),
119
+ index=0,
120
+ )
121
+ else:
122
+ cid = next(iter(existing_tag_sets[did].keys()))
123
 
124
+ if st.sidebar.button("pre-load this tag set"):
125
+ pre_loaded = json.load(open(existing_tag_sets[did][cid]))
126
+ st.experimental_set_query_params(preload_dataset=did)
127
+ if st.sidebar.button("flush state"):
128
+ pre_loaded = new_pre_loaded()
129
+ st.experimental_set_query_params()
130
 
131
+ leftcol, _, rightcol = st.beta_columns([12, 1, 12])
 
 
 
132
 
 
 
 
133
 
134
+ pre_loaded["languages"] = list(set(pre_loaded["languages"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
+ leftcol.markdown("### Supported tasks")
137
+ task_categories = leftcol.multiselect(
138
+ "What categories of task does the dataset support?",
139
+ options=list(task_set.keys()),
140
+ default=pre_loaded["task_categories"],
141
+ format_func=lambda tg: f"{tg} : {task_set[tg]['description']}",
142
+ )
143
+ task_specifics = []
144
+ for tg in task_categories:
145
+ task_specs = leftcol.multiselect(
146
+ f"What specific *{tg}* tasks does the dataset support?",
147
+ options=task_set[tg]["options"],
148
+ default=[ts for ts in pre_loaded["task_ids"] if ts in task_set[tg]["options"]],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  )
150
+ if "other" in task_specs:
151
+ other_task = st.text_input(
152
+ "You selected 'other' task. Please enter a short hyphen-separated description for the task:",
153
+ value="my-task-description",
154
  )
155
+ st.write(f"Registering {tg}-other-{other_task} task")
156
+ task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
157
+ task_specifics += task_specs
158
+
159
+ leftcol.markdown("### Languages")
160
+ multilinguality = leftcol.multiselect(
161
+ "Does the dataset contain more than one language?",
162
+ options=list(multilinguality_set.keys()),
163
+ default=pre_loaded["multilinguality"],
164
+ format_func=lambda m: f"{m} : {multilinguality_set[m]}",
165
+ )
166
+ if "other" in multilinguality:
167
+ other_multilinguality = st.text_input(
168
+ "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
169
+ value="my-multilinguality",
170
  )
171
+ st.write(f"Registering other-{other_multilinguality} multilinguality")
172
+ multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
173
+ languages = leftcol.multiselect(
174
+ "What languages are represented in the dataset?",
175
+ options=list(language_set.keys()),
176
+ default=pre_loaded["languages"],
177
+ format_func=lambda m: f"{m} : {language_set[m]}",
178
+ )
179
 
180
+ leftcol.markdown("### Dataset creators")
181
+ language_creators = leftcol.multiselect(
182
+ "Where does the text in the dataset come from?",
183
+ options=creator_set["language"],
184
+ default=pre_loaded["language_creators"],
185
+ )
186
+ annotations_creators = leftcol.multiselect(
187
+ "Where do the annotations in the dataset come from?",
188
+ options=creator_set["annotations"],
189
+ default=pre_loaded["annotations_creators"],
190
+ )
191
+ licenses = leftcol.multiselect(
192
+ "What licenses is the dataset under?",
193
+ options=list(license_set.keys()),
194
+ default=pre_loaded["licenses"],
195
+ format_func=lambda l: f"{l} : {license_set[l]}",
196
+ )
197
+ if "other" in licenses:
198
+ other_license = st.text_input(
199
+ "You selected 'other' type of license. Please enter a short hyphen-separated description:",
200
+ value="my-license",
201
  )
202
+ st.write(f"Registering other-{other_license} license")
203
+ licenses[licenses.index("other")] = f"other-{other_license}"
204
+ # link to supported datasets
205
+ pre_select_ext_a = []
206
+ if "original" in pre_loaded["source_datasets"]:
207
+ pre_select_ext_a += ["original"]
208
+ if any([p.startswith("extended") for p in pre_loaded["source_datasets"]]):
209
+ pre_select_ext_a += ["extended"]
210
+ extended = leftcol.multiselect(
211
+ "Does the dataset contain original data and/or was it extended from other datasets?",
212
+ options=["original", "extended"],
213
+ default=pre_select_ext_a,
214
+ )
215
+ source_datasets = ["original"] if "original" in extended else []
216
+ if "extended" in extended:
217
+ pre_select_ext_b = [p.split("|")[1] for p in pre_loaded["source_datasets"] if p.startswith("extended")]
218
+ extended_sources = leftcol.multiselect(
219
+ "Which other datasets does this one use data from?",
220
+ options=all_dataset_ids + ["other"],
221
+ default=pre_select_ext_b,
222
  )
223
+ if "other" in extended_sources:
224
+ other_extended_sources = st.text_input(
225
+ "You selected 'other' dataset. Please enter a short hyphen-separated description:",
226
+ value="my-dataset",
 
 
 
227
  )
228
+ st.write(f"Registering other-{other_extended_sources} dataset")
229
+ extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
230
+ source_datasets += [f"extended|{src}" for src in extended_sources]
231
+ size_category = leftcol.selectbox(
232
+ "What is the size category of the dataset?",
233
+ options=["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"],
234
+ index=["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"].index(
235
+ (pre_loaded.get("size_categories") or ["unknown"])[0]
236
+ ),
 
 
 
 
237
  )
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
+
240
+ ########################
241
+ ## Show results
242
+ ########################
243
+ rightcol.markdown(
244
+ f"""
245
+ ### Finalized tag set
246
+ ```yaml
247
+ {yaml.dump({
248
  "task_categories": task_categories,
249
  "task_ids": task_specifics,
250
  "multilinguality": multilinguality,
 
252
  "language_creators": language_creators,
253
  "annotations_creators": annotations_creators,
254
  "source_datasets": source_datasets,
255
+ "size_categories": size_category,
256
  "licenses": licenses,
257
+ })}
258
+ ```
259
+ """
260
+ )