theo committed on
Commit
8b77729
·
1 Parent(s): 08a65ff

dockerfile builder + metadata builder

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .idea
build_docker_image.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
#
# Build the `dataset-tagger` Docker image for the Streamlit tagging app.
#
# Steps:
#   1. write a throw-away Dockerfile into the current directory,
#   2. run ./build_metadata_file.py to produce a metadata_<sha>.json export,
#   3. run `docker build` with the current directory as build context.
#
# The generated Dockerfile is removed when the script exits, whether it
# succeeded or failed.

# Fail fast from the very first command (the original only enabled this after
# the Dockerfile had been written); -x traces the commands being run.
set -eEx

cleanup() {
  rm -f Dockerfile
}

# With `set -e` every failure ends in an exit, so the EXIT trap alone covers
# both the success and the error path (and cleanup runs only once).
trap cleanup EXIT

# Quoted delimiter: the Dockerfile text is written verbatim, no shell expansion.
cat > Dockerfile << 'EOF'
FROM python
# Install dependencies before copying the app so that this layer stays cached
# when only the app or its data files change.
COPY requirements.txt .
RUN pip install -r requirements.txt
# tagging_app.py reads task_set.json, license_set.json, language_set.json and
# the newest metadata_*.json from its working directory at start-up, so they
# have to be shipped in the image next to the script.
COPY tagging_app.py *.json ./
CMD ["streamlit", "run", "tagging_app.py"]
EOF

./build_metadata_file.py
docker build -t dataset-tagger .
build_metadata_file.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ """ This script will clone the `datasets` repository in your current directory and parse all currently available
4
+ metadata, from the `README.md` yaml headers and the automatically generated json files.
5
+ It dumps the results in a `metadata_{current-commit-of-datasets}.json` file.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ from subprocess import check_call, check_output
11
+ from typing import Dict
12
+
13
+ import yaml
14
+
15
+
16
def metadata_from_readme(f: Path) -> Dict:
    """Parse the YAML front-matter block at the top of a README file.

    The header is the text between a first line that is exactly ``---`` and
    the next line that is exactly ``---``.

    Args:
        f: path of the README file to read.

    Returns:
        The parsed header (an empty dict when the header is present but
        empty), or ``None`` when the file is empty or has no complete
        ``---`` ... ``---`` header.

    Raises:
        OSError: if the file cannot be opened.
        yaml.YAMLError: if the header is not valid YAML.
    """
    with f.open(encoding="utf-8") as fi:
        # Only strip trailing whitespace/newlines: leading indentation is
        # significant in YAML and must survive for nested structures.
        lines = [line.rstrip() for line in fi]

    if not lines or lines[0] != "---":
        return None
    try:
        # Search for the closing delimiter after the opening one.
        closing = lines.index("---", 1)
    except ValueError:
        return None
    return yaml.safe_load("\n".join(lines[1:closing])) or dict()
23
+
24
+
25
def load_ds_datas():
    """Fetch the `datasets` repository and collect per-dataset metadata.

    Clones https://github.com/huggingface/datasets.git into ``./datasets``
    (or runs ``git pull`` there when the directory already exists), then
    walks ``datasets/datasets/*`` and reads, for each entry, the YAML header
    of its ``README.md`` and its ``dataset_infos.json``.

    Returns:
        A ``(head_sha, datasets_md)`` tuple where ``head_sha`` is the HEAD
        commit of the ``datasets`` checkout and ``datasets_md`` maps each
        entry name to ``{"metadata": <README header>, "infos": <json or None>}``.
        Entries without a non-empty README header are left out.

    Raises:
        subprocess.CalledProcessError: if a git command fails.
    """
    drepo = Path("datasets")
    if drepo.is_dir():
        check_call(["git", "pull"], cwd=str(drepo))
    else:
        check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
    # Ask the *datasets* checkout for its HEAD; without `cwd` git would report
    # the commit of whatever repository the script happens to be started from.
    head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=str(drepo)).decode().strip()

    datasets_md = dict()

    for ddir in sorted((drepo / "datasets").iterdir(), key=lambda d: d.name):
        # Both reads are best effort: a missing or malformed file only means
        # "no data" for this entry. `Exception` (rather than a bare `except`)
        # keeps Ctrl-C and SystemExit working.
        try:
            metadata = metadata_from_readme(ddir / "README.md")
        except Exception:
            metadata = None

        try:
            with (ddir / "dataset_infos.json").open() as fi:
                infos = json.load(fi)
        except Exception:
            infos = None

        if metadata:
            datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
    return head_sha, datasets_md
51
+
52
+
53
if __name__ == "__main__":
    # Collect the metadata and dump it next to the script, in a file named
    # after the commit hash returned by load_ds_datas().
    commit_sha, all_metadata = load_ds_datas()
    export_path = Path(f"metadata_{commit_sha}.json")
    export_path.write_text(json.dumps(all_metadata))
language_set.json CHANGED
@@ -345,7 +345,6 @@
345
  "pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
346
  "ps": "Pushto, Pashto",
347
  "pt": "Portuguese",
348
- "qaa..qtz": "Private use",
349
  "qu": "Quechua",
350
  "raj": "Rajasthani",
351
  "rap": "Rapanui",
 
345
  "pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
346
  "ps": "Pushto, Pashto",
347
  "pt": "Portuguese",
 
348
  "qu": "Quechua",
349
  "raj": "Rajasthani",
350
  "rap": "Rapanui",
tagging_app.py CHANGED
@@ -1,9 +1,7 @@
1
  import json
2
- import os
3
- from dataclasses import asdict
4
- from glob import glob
5
 
6
- import datasets
7
  import streamlit as st
8
  import yaml
9
 
@@ -17,7 +15,6 @@ st.set_page_config(
17
  task_set = json.load(open("task_set.json"))
18
  license_set = json.load(open("license_set.json"))
19
  language_set_restricted = json.load(open("language_set.json"))
20
- language_set = json.load(open("language_set_full.json"))
21
 
22
  multilinguality_set = {
23
  "monolingual": "contains a single language",
@@ -49,13 +46,21 @@ creator_set = {
49
  ########################
50
 
51
 
52
- def load_existing_tags():
53
- has_tags = {}
54
- for fname in glob("saved_tags/*/*/tags.json"):
55
- _, did, cid, _ = fname.split(os.sep)
56
- has_tags[did] = has_tags.get(did, {})
57
- has_tags[did][cid] = fname
58
- return has_tags
 
 
 
 
 
 
 
 
59
 
60
 
61
  def new_pre_loaded():
@@ -73,8 +78,8 @@ def new_pre_loaded():
73
 
74
 
75
  pre_loaded = new_pre_loaded()
76
-
77
- existing_tag_sets = load_existing_tags()
78
  all_dataset_ids = list(existing_tag_sets.keys())
79
 
80
 
@@ -104,34 +109,29 @@ Beware that clicking pre-load will overwrite the current state!
104
 
105
  qp = st.experimental_get_query_params()
106
  preload = qp.get("preload_dataset", list())
107
- did_index = 2
 
108
  if len(preload) == 1 and preload[0] in all_dataset_ids:
109
- did_qp, *_ = preload
110
- cid_qp = next(iter(existing_tag_sets[did_qp]))
111
- pre_loaded = json.load(open(existing_tag_sets[did_qp][cid_qp]))
112
- did_index = all_dataset_ids.index(did_qp)
113
 
114
  did = st.sidebar.selectbox(label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index)
115
- if len(existing_tag_sets[did]) > 1:
116
- cid = st.sidebar.selectbox(
117
- label="Choose config to load tag set from",
118
- options=list(existing_tag_sets[did].keys()),
119
- index=0,
120
- )
121
- else:
122
- cid = next(iter(existing_tag_sets[did].keys()))
123
 
124
- if st.sidebar.button("pre-load this tag set"):
125
- pre_loaded = json.load(open(existing_tag_sets[did][cid]))
 
126
  st.experimental_set_query_params(preload_dataset=did)
127
- if st.sidebar.button("flush state"):
128
  pre_loaded = new_pre_loaded()
129
  st.experimental_set_query_params()
130
 
131
- leftcol, _, rightcol = st.beta_columns([12, 1, 12])
 
 
132
 
 
133
 
134
- pre_loaded["languages"] = list(set(pre_loaded["languages"]))
135
 
136
  leftcol.markdown("### Supported tasks")
137
  task_categories = leftcol.multiselect(
@@ -156,13 +156,18 @@ for tg in task_categories:
156
  task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
157
  task_specifics += task_specs
158
 
 
159
  leftcol.markdown("### Languages")
 
 
 
160
  multilinguality = leftcol.multiselect(
161
  "Does the dataset contain more than one language?",
162
  options=list(multilinguality_set.keys()),
163
  default=pre_loaded["multilinguality"],
164
  format_func=lambda m: f"{m} : {multilinguality_set[m]}",
165
  )
 
166
  if "other" in multilinguality:
167
  other_multilinguality = st.text_input(
168
  "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
@@ -170,28 +175,42 @@ if "other" in multilinguality:
170
  )
171
  st.write(f"Registering other-{other_multilinguality} multilinguality")
172
  multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
 
 
 
173
  languages = leftcol.multiselect(
174
  "What languages are represented in the dataset?",
175
- options=list(language_set.keys()),
176
  default=pre_loaded["languages"],
177
- format_func=lambda m: f"{m} : {language_set[m]}",
178
  )
179
 
 
180
  leftcol.markdown("### Dataset creators")
 
 
 
181
  language_creators = leftcol.multiselect(
182
  "Where does the text in the dataset come from?",
183
  options=creator_set["language"],
184
- default=pre_loaded["language_creators"],
185
  )
 
 
 
186
  annotations_creators = leftcol.multiselect(
187
  "Where do the annotations in the dataset come from?",
188
  options=creator_set["annotations"],
189
- default=pre_loaded["annotations_creators"],
190
  )
 
 
 
 
191
  licenses = leftcol.multiselect(
192
  "What licenses is the dataset under?",
193
  options=list(license_set.keys()),
194
- default=pre_loaded["licenses"],
195
  format_func=lambda l: f"{l} : {license_set[l]}",
196
  )
197
  if "other" in licenses:
@@ -228,33 +247,42 @@ if "extended" in extended:
228
  st.write(f"Registering other-{other_extended_sources} dataset")
229
  extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
230
  source_datasets += [f"extended|{src}" for src in extended_sources]
 
 
 
 
 
 
231
  size_category = leftcol.selectbox(
232
  "What is the size category of the dataset?",
233
- options=["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"],
234
- index=["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"].index(
235
- (pre_loaded.get("size_categories") or ["unknown"])[0]
236
- ),
237
  )
238
 
239
 
240
  ########################
241
  ## Show results
242
  ########################
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  rightcol.markdown(
244
  f"""
245
  ### Finalized tag set
 
 
 
246
  ```yaml
247
- {yaml.dump({
248
- "task_categories": task_categories,
249
- "task_ids": task_specifics,
250
- "multilinguality": multilinguality,
251
- "languages": languages,
252
- "language_creators": language_creators,
253
- "annotations_creators": annotations_creators,
254
- "source_datasets": source_datasets,
255
- "size_categories": size_category,
256
- "licenses": licenses,
257
- })}
258
- ```
259
- """
260
  )
 
1
  import json
2
+ from pathlib import Path
3
+ from typing import List, Tuple
 
4
 
 
5
  import streamlit as st
6
  import yaml
7
 
 
15
  task_set = json.load(open("task_set.json"))
16
  license_set = json.load(open("license_set.json"))
17
  language_set_restricted = json.load(open("language_set.json"))
 
18
 
19
  multilinguality_set = {
20
  "monolingual": "contains a single language",
 
46
  ########################
47
 
48
 
49
@st.cache(allow_output_mutation=True)
def load_ds_datas():
    """Load the most recently modified ``metadata_*.json`` export.

    Looks in the current working directory for the files written by
    ``./build_metadata_file.py`` and parses the one with the latest
    modification time.

    Returns:
        The decoded JSON content of that export.

    Raises:
        ValueError: if no export file is present.
    """
    # Restrict the match to regular `.json` files so that a stray directory or
    # unrelated file whose name starts with `metadata_` is never parsed.
    exports = [f for f in Path.cwd().glob("metadata_*.json") if f.is_file()]
    if not exports:
        raise ValueError("need to run ./build_metadata_file.py at least once")
    newest = max(exports, key=lambda f: f.lstat().st_mtime)
    with newest.open() as fi:
        return json.load(fi)
60
+
61
+
62
def split_known(vals: List[str], okset: List[str]) -> Tuple[List[str], List[str]]:
    """Partition ``vals`` by membership in ``okset``.

    Returns a ``(known, unknown)`` pair: the values found in ``okset`` and the
    values not found in it, each in their original order.
    """
    known: List[str] = []
    unknown: List[str] = []
    for candidate in vals:
        bucket = known if candidate in okset else unknown
        bucket.append(candidate)
    return known, unknown
64
 
65
 
66
  def new_pre_loaded():
 
78
 
79
 
80
  pre_loaded = new_pre_loaded()
81
+ datasets_md = load_ds_datas()
82
+ existing_tag_sets = {name: mds["metadata"] for name, mds in datasets_md.items()}
83
  all_dataset_ids = list(existing_tag_sets.keys())
84
 
85
 
 
109
 
110
  qp = st.experimental_get_query_params()
111
  preload = qp.get("preload_dataset", list())
112
+ preloaded_id = None
113
+ did_index = 0
114
  if len(preload) == 1 and preload[0] in all_dataset_ids:
115
+ preloaded_id, *_ = preload
116
+ pre_loaded = existing_tag_sets[preloaded_id] or new_pre_loaded()
117
+ did_index = all_dataset_ids.index(preloaded_id)
 
118
 
119
  did = st.sidebar.selectbox(label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index)
 
 
 
 
 
 
 
 
120
 
121
+ leftbtn, rightbtn = st.sidebar.beta_columns(2)
122
+ if leftbtn.button("pre-load tagset"):
123
+ pre_loaded = existing_tag_sets[did] or new_pre_loaded()
124
  st.experimental_set_query_params(preload_dataset=did)
125
+ if rightbtn.button("flush state"):
126
  pre_loaded = new_pre_loaded()
127
  st.experimental_set_query_params()
128
 
129
+ if preloaded_id is not None:
130
+ st.sidebar.markdown(f"Took [{preloaded_id}](https://huggingface.co/datasets/{preloaded_id}) as base tagset.")
131
+
132
 
133
+ leftcol, _, rightcol = st.beta_columns([12, 1, 12])
134
 
 
135
 
136
  leftcol.markdown("### Supported tasks")
137
  task_categories = leftcol.multiselect(
 
156
  task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
157
  task_specifics += task_specs
158
 
159
+
160
  leftcol.markdown("### Languages")
161
+ filtered_existing_languages = [lgc for lgc in set(pre_loaded["languages"]) if lgc not in language_set_restricted]
162
+ pre_loaded["languages"] = [lgc for lgc in set(pre_loaded["languages"]) if lgc in language_set_restricted]
163
+
164
  multilinguality = leftcol.multiselect(
165
  "Does the dataset contain more than one language?",
166
  options=list(multilinguality_set.keys()),
167
  default=pre_loaded["multilinguality"],
168
  format_func=lambda m: f"{m} : {multilinguality_set[m]}",
169
  )
170
+
171
  if "other" in multilinguality:
172
  other_multilinguality = st.text_input(
173
  "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
 
175
  )
176
  st.write(f"Registering other-{other_multilinguality} multilinguality")
177
  multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
178
+
179
+ if len(filtered_existing_languages) > 0:
180
+ leftcol.markdown(f"**Found bad language codes in existing tagset**:\n{filtered_existing_languages}")
181
  languages = leftcol.multiselect(
182
  "What languages are represented in the dataset?",
183
+ options=list(language_set_restricted.keys()),
184
  default=pre_loaded["languages"],
185
+ format_func=lambda m: f"{m} : {language_set_restricted[m]}",
186
  )
187
 
188
+
189
  leftcol.markdown("### Dataset creators")
190
+ ok, nonok = split_known(pre_loaded["language_creators"], creator_set["language"])
191
+ if len(nonok) > 0:
192
+ leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
193
  language_creators = leftcol.multiselect(
194
  "Where does the text in the dataset come from?",
195
  options=creator_set["language"],
196
+ default=ok,
197
  )
198
+ ok, nonok = split_known(pre_loaded["annotations_creators"], creator_set["annotations"])
199
+ if len(nonok) > 0:
200
+ leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
201
  annotations_creators = leftcol.multiselect(
202
  "Where do the annotations in the dataset come from?",
203
  options=creator_set["annotations"],
204
+ default=ok,
205
  )
206
+
207
+ ok, nonok = split_known(pre_loaded["licenses"], list(license_set.keys()))
208
+ if len(nonok) > 0:
209
+ leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
210
  licenses = leftcol.multiselect(
211
  "What licenses is the dataset under?",
212
  options=list(license_set.keys()),
213
+ default=ok,
214
  format_func=lambda l: f"{l} : {license_set[l]}",
215
  )
216
  if "other" in licenses:
 
247
  st.write(f"Registering other-{other_extended_sources} dataset")
248
  extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
249
  source_datasets += [f"extended|{src}" for src in extended_sources]
250
+
251
+ size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]
252
+ current_size_cats = pre_loaded.get("size_categories") or ["unknown"]
253
+ ok, nonok = split_known(current_size_cats, size_cats)
254
+ if len(nonok) > 0:
255
+ leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
256
  size_category = leftcol.selectbox(
257
  "What is the size category of the dataset?",
258
+ options=size_cats,
259
+ index=size_cats.index(ok[0]) if len(ok) > 0 else 0,
 
 
260
  )
261
 
262
 
263
  ########################
264
  ## Show results
265
  ########################
266
+ yamlblock = yaml.dump(
267
+ {
268
+ "task_categories": task_categories,
269
+ "task_ids": task_specifics,
270
+ "multilinguality": multilinguality,
271
+ "languages": languages,
272
+ "language_creators": language_creators,
273
+ "annotations_creators": annotations_creators,
274
+ "source_datasets": source_datasets,
275
+ "size_categories": size_category,
276
+ "licenses": licenses,
277
+ }
278
+ )
279
  rightcol.markdown(
280
  f"""
281
  ### Finalized tag set
282
+
283
+ Copy it into your dataset's `README.md` header! 🤗
284
+
285
  ```yaml
286
+ {yamlblock}
287
+ ```""",
 
 
 
 
 
 
 
 
 
 
 
288
  )