Spaces:
Running
Running
theo
committed on
Commit
·
c4882f0
1
Parent(s):
ef36700
rely on tagsets from datasets
Browse files
- tagging_app.py +18 -45
tagging_app.py
CHANGED
@@ -5,7 +5,9 @@ from typing import Callable, Dict, List, Tuple
|
|
5 |
import langcodes as lc
|
6 |
import streamlit as st
|
7 |
import yaml
|
8 |
-
from datasets.utils.metadata import DatasetMetadata
|
|
|
|
|
9 |
|
10 |
st.set_page_config(
|
11 |
page_title="HF Dataset Tagging App",
|
@@ -26,34 +28,6 @@ st.markdown(
|
|
26 |
unsafe_allow_html=True,
|
27 |
)
|
28 |
|
29 |
-
task_set = json.load(open("task_set.json"))
|
30 |
-
license_set = json.load(open("license_set.json"))
|
31 |
-
|
32 |
-
multilinguality_set = {
|
33 |
-
"monolingual": "contains a single language",
|
34 |
-
"multilingual": "contains multiple languages",
|
35 |
-
"translation": "contains translated or aligned text",
|
36 |
-
"other": "other type of language distribution",
|
37 |
-
}
|
38 |
-
|
39 |
-
creator_set = {
|
40 |
-
"language": [
|
41 |
-
"found",
|
42 |
-
"crowdsourced",
|
43 |
-
"expert-generated",
|
44 |
-
"machine-generated",
|
45 |
-
"other",
|
46 |
-
],
|
47 |
-
"annotations": [
|
48 |
-
"found",
|
49 |
-
"crowdsourced",
|
50 |
-
"expert-generated",
|
51 |
-
"machine-generated",
|
52 |
-
"no-annotation",
|
53 |
-
"other",
|
54 |
-
],
|
55 |
-
}
|
56 |
-
|
57 |
########################
|
58 |
## Helper functions
|
59 |
########################
|
@@ -117,7 +91,7 @@ def new_state() -> Dict[str, List]:
|
|
117 |
|
118 |
|
119 |
def is_state_empty(state: Dict[str, List]) -> bool:
|
120 |
-
return sum(len(v) if v is not None else 0 for v in state.values())
|
121 |
|
122 |
|
123 |
state = new_state()
|
@@ -160,7 +134,7 @@ if leftbtn.button("pre-load"):
|
|
160 |
initial_state = existing_tag_sets[preloaded_id]
|
161 |
state = initial_state or new_state()
|
162 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
163 |
-
if is_state_empty(state):
|
164 |
if rightbtn.button("flush state"):
|
165 |
state = new_state()
|
166 |
initial_state = None
|
@@ -195,8 +169,8 @@ state["task_categories"] = multiselect(
|
|
195 |
"Task category",
|
196 |
"What categories of task does the dataset support?",
|
197 |
values=state["task_categories"],
|
198 |
-
valid_set=list(
|
199 |
-
format_func=lambda tg: f"{tg}: {
|
200 |
)
|
201 |
task_specifics = []
|
202 |
for tg in state["task_categories"]:
|
@@ -204,8 +178,8 @@ for tg in state["task_categories"]:
|
|
204 |
leftcol,
|
205 |
f"Specific _{tg}_ tasks",
|
206 |
f"What specific tasks does the dataset support?",
|
207 |
-
values=[ts for ts in (state["task_ids"] or []) if ts in
|
208 |
-
valid_set=
|
209 |
)
|
210 |
if "other" in specs:
|
211 |
other_task = st.text_input(
|
@@ -224,8 +198,8 @@ state["multilinguality"] = multiselect(
|
|
224 |
"Monolingual?",
|
225 |
"Does the dataset contain more than one language?",
|
226 |
values=state["multilinguality"],
|
227 |
-
valid_set=list(
|
228 |
-
format_func=lambda m: f"{m} : {
|
229 |
)
|
230 |
|
231 |
if "other" in state["multilinguality"]:
|
@@ -260,14 +234,14 @@ state["language_creators"] = multiselect(
|
|
260 |
"Data origin",
|
261 |
"Where does the text in the dataset come from?",
|
262 |
values=state["language_creators"],
|
263 |
-
valid_set=
|
264 |
)
|
265 |
state["annotations_creators"] = multiselect(
|
266 |
leftcol,
|
267 |
"Annotations origin",
|
268 |
"Where do the annotations in the dataset come from?",
|
269 |
values=state["annotations_creators"],
|
270 |
-
valid_set=
|
271 |
)
|
272 |
|
273 |
|
@@ -275,9 +249,9 @@ state["licenses"] = multiselect(
|
|
275 |
leftcol,
|
276 |
"Licenses",
|
277 |
"What licenses is the dataset under?",
|
278 |
-
valid_set=list(
|
279 |
values=state["licenses"],
|
280 |
-
format_func=lambda l: f"{l} : {
|
281 |
)
|
282 |
if "other" in state["licenses"]:
|
283 |
other_license = st.text_input(
|
@@ -320,16 +294,15 @@ if "extended" in state["extended"]:
|
|
320 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
321 |
state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
|
322 |
|
323 |
-
size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M", ...]
|
324 |
current_size_cats = state.get("size_categories") or ["unknown"]
|
325 |
-
ok, nonok = split_known(current_size_cats,
|
326 |
if len(nonok) > 0:
|
327 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
328 |
state["size_categories"] = [
|
329 |
leftcol.selectbox(
|
330 |
"What is the size category of the dataset?",
|
331 |
-
options=
|
332 |
-
index=
|
333 |
)
|
334 |
]
|
335 |
|
|
|
5 |
import langcodes as lc
|
6 |
import streamlit as st
|
7 |
import yaml
|
8 |
+
from datasets.utils.metadata import (DatasetMetadata, known_creators,
|
9 |
+
known_licenses, known_multilingualities,
|
10 |
+
known_size_categories, known_task_ids)
|
11 |
|
12 |
st.set_page_config(
|
13 |
page_title="HF Dataset Tagging App",
|
|
|
28 |
unsafe_allow_html=True,
|
29 |
)
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
########################
|
32 |
## Helper functions
|
33 |
########################
|
|
|
91 |
|
92 |
|
93 |
def is_state_empty(state: Dict[str, List]) -> bool:
|
94 |
+
return sum(len(v) if v is not None else 0 for v in state.values()) == 0
|
95 |
|
96 |
|
97 |
state = new_state()
|
|
|
134 |
initial_state = existing_tag_sets[preloaded_id]
|
135 |
state = initial_state or new_state()
|
136 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
137 |
+
if not is_state_empty(state):
|
138 |
if rightbtn.button("flush state"):
|
139 |
state = new_state()
|
140 |
initial_state = None
|
|
|
169 |
"Task category",
|
170 |
"What categories of task does the dataset support?",
|
171 |
values=state["task_categories"],
|
172 |
+
valid_set=list(known_task_ids.keys()),
|
173 |
+
format_func=lambda tg: f"{tg}: {known_task_ids[tg]['description']}",
|
174 |
)
|
175 |
task_specifics = []
|
176 |
for tg in state["task_categories"]:
|
|
|
178 |
leftcol,
|
179 |
f"Specific _{tg}_ tasks",
|
180 |
f"What specific tasks does the dataset support?",
|
181 |
+
values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[tg]["options"]],
|
182 |
+
valid_set=known_task_ids[tg]["options"],
|
183 |
)
|
184 |
if "other" in specs:
|
185 |
other_task = st.text_input(
|
|
|
198 |
"Monolingual?",
|
199 |
"Does the dataset contain more than one language?",
|
200 |
values=state["multilinguality"],
|
201 |
+
valid_set=list(known_multilingualities.keys()),
|
202 |
+
format_func=lambda m: f"{m} : {known_multilingualities[m]}",
|
203 |
)
|
204 |
|
205 |
if "other" in state["multilinguality"]:
|
|
|
234 |
"Data origin",
|
235 |
"Where does the text in the dataset come from?",
|
236 |
values=state["language_creators"],
|
237 |
+
valid_set=known_creators["language"],
|
238 |
)
|
239 |
state["annotations_creators"] = multiselect(
|
240 |
leftcol,
|
241 |
"Annotations origin",
|
242 |
"Where do the annotations in the dataset come from?",
|
243 |
values=state["annotations_creators"],
|
244 |
+
valid_set=known_creators["annotations"],
|
245 |
)
|
246 |
|
247 |
|
|
|
249 |
leftcol,
|
250 |
"Licenses",
|
251 |
"What licenses is the dataset under?",
|
252 |
+
valid_set=list(known_licenses.keys()),
|
253 |
values=state["licenses"],
|
254 |
+
format_func=lambda l: f"{l} : {known_licenses[l]}",
|
255 |
)
|
256 |
if "other" in state["licenses"]:
|
257 |
other_license = st.text_input(
|
|
|
294 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
295 |
state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
|
296 |
|
|
|
297 |
current_size_cats = state.get("size_categories") or ["unknown"]
|
298 |
+
ok, nonok = split_known(current_size_cats, known_size_categories)
|
299 |
if len(nonok) > 0:
|
300 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
301 |
state["size_categories"] = [
|
302 |
leftcol.selectbox(
|
303 |
"What is the size category of the dataset?",
|
304 |
+
options=known_size_categories,
|
305 |
+
index=known_size_categories.index(ok[0]) if len(ok) > 0 else 0,
|
306 |
)
|
307 |
]
|
308 |
|