yacine committed on
Commit
cca065e
·
1 Parent(s): be293db

expanded tag sets

Browse files
Files changed (6) hide show
  1. language_set.json +478 -0
  2. language_set_full.json +0 -0
  3. license_set.json +452 -0
  4. tag_set.json +0 -1
  5. tagging_app.py +19 -144
  6. task_set.json +84 -0
language_set.json ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aa": "Afar",
3
+ "ab": "Abkhazian",
4
+ "ace": "Achinese",
5
+ "ach": "Acoli",
6
+ "ada": "Adangme",
7
+ "ady": "Adyghe, Adygei",
8
+ "ae": "Avestan",
9
+ "af": "Afrikaans",
10
+ "afa": "Afro-Asiatic languages",
11
+ "afh": "Afrihili",
12
+ "ain": "Ainu (Japan)",
13
+ "ak": "Akan",
14
+ "akk": "Akkadian",
15
+ "ale": "Aleut",
16
+ "alg": "Algonquian languages",
17
+ "alt": "Southern Altai",
18
+ "am": "Amharic",
19
+ "an": "Aragonese",
20
+ "ang": "Old English (ca. 450-1100)",
21
+ "apa": "Apache languages",
22
+ "ar": "Arabic",
23
+ "arc": "Official Aramaic (700-300 BCE), Imperial Aramaic (700-300 BCE)",
24
+ "arn": "Mapudungun, Mapuche",
25
+ "arp": "Arapaho",
26
+ "art": "Artificial languages",
27
+ "arw": "Arawak",
28
+ "as": "Assamese",
29
+ "ast": "Asturian, Asturleonese, Bable, Leonese",
30
+ "ath": "Athapascan languages",
31
+ "aus": "Australian languages",
32
+ "av": "Avaric",
33
+ "awa": "Awadhi",
34
+ "ay": "Aymara",
35
+ "az": "Azerbaijani",
36
+ "ba": "Bashkir",
37
+ "bad": "Banda languages",
38
+ "bai": "Bamileke languages",
39
+ "bal": "Baluchi",
40
+ "ban": "Balinese",
41
+ "bas": "Basa (Cameroon)",
42
+ "bat": "Baltic languages",
43
+ "be": "Belarusian",
44
+ "bej": "Beja, Bedawiyet",
45
+ "bem": "Bemba (Zambia)",
46
+ "ber": "Berber languages",
47
+ "bg": "Bulgarian",
48
+ "bh": "Bihari languages",
49
+ "bho": "Bhojpuri",
50
+ "bi": "Bislama",
51
+ "bik": "Bikol",
52
+ "bin": "Bini, Edo",
53
+ "bla": "Siksika",
54
+ "bm": "Bambara",
55
+ "bn": "Bengali, Bangla",
56
+ "bnt": "Bantu languages",
57
+ "bo": "Tibetan",
58
+ "br": "Breton",
59
+ "bra": "Braj",
60
+ "bs": "Bosnian",
61
+ "btk": "Batak languages",
62
+ "bua": "Buriat",
63
+ "bug": "Buginese",
64
+ "byn": "Bilin, Blin",
65
+ "ca": "Catalan, Valencian",
66
+ "cad": "Caddo",
67
+ "cai": "Central American Indian languages",
68
+ "car": "Galibi Carib",
69
+ "cau": "Caucasian languages",
70
+ "ce": "Chechen",
71
+ "ceb": "Cebuano",
72
+ "cel": "Celtic languages",
73
+ "ch": "Chamorro",
74
+ "chb": "Chibcha",
75
+ "chg": "Chagatai",
76
+ "chk": "Chuukese",
77
+ "chm": "Mari (Russia)",
78
+ "chn": "Chinook jargon",
79
+ "cho": "Choctaw",
80
+ "chp": "Chipewyan, Dene Suline",
81
+ "chr": "Cherokee",
82
+ "chy": "Cheyenne",
83
+ "cmc": "Chamic languages",
84
+ "co": "Corsican",
85
+ "cop": "Coptic",
86
+ "cpe": "English-based creoles and pidgins",
87
+ "cpf": "French-based creoles and pidgins",
88
+ "cpp": "Portuguese-based creoles and pidgins",
89
+ "cr": "Cree",
90
+ "crh": "Crimean Tatar, Crimean Turkish",
91
+ "crp": "Creoles and pidgins",
92
+ "cs": "Czech",
93
+ "csb": "Kashubian",
94
+ "cu": "Church Slavic, Church Slavonic, Old Bulgarian, Old Church Slavonic, Old Slavonic",
95
+ "cus": "Cushitic languages",
96
+ "cv": "Chuvash",
97
+ "cy": "Welsh",
98
+ "da": "Danish",
99
+ "dak": "Dakota",
100
+ "dar": "Dargwa",
101
+ "day": "Land Dayak languages",
102
+ "de": "German",
103
+ "del": "Delaware",
104
+ "den": "Slave (Athapascan)",
105
+ "dgr": "Dogrib, T\u0142\u0131\u0328ch\u01eb",
106
+ "din": "Dinka",
107
+ "doi": "Dogri (macrolanguage)",
108
+ "dra": "Dravidian languages",
109
+ "dsb": "Lower Sorbian",
110
+ "dua": "Duala",
111
+ "dum": "Middle Dutch (ca. 1050-1350)",
112
+ "dv": "Dhivehi, Divehi, Maldivian",
113
+ "dyu": "Dyula",
114
+ "dz": "Dzongkha",
115
+ "ee": "Ewe",
116
+ "efi": "Efik",
117
+ "egy": "Egyptian (Ancient)",
118
+ "eka": "Ekajuk",
119
+ "el": "Modern Greek (1453-)",
120
+ "elx": "Elamite",
121
+ "en": "English",
122
+ "enm": "Middle English (1100-1500)",
123
+ "eo": "Esperanto",
124
+ "es": "Spanish, Castilian",
125
+ "et": "Estonian",
126
+ "eu": "Basque",
127
+ "ewo": "Ewondo",
128
+ "fa": "Persian",
129
+ "fan": "Fang (Equatorial Guinea)",
130
+ "fat": "Fanti",
131
+ "ff": "Fulah",
132
+ "fi": "Finnish",
133
+ "fil": "Filipino, Pilipino",
134
+ "fiu": "Finno-Ugrian languages",
135
+ "fj": "Fijian",
136
+ "fo": "Faroese",
137
+ "fon": "Fon",
138
+ "fr": "French",
139
+ "frm": "Middle French (ca. 1400-1600)",
140
+ "fro": "Old French (842-ca. 1400)",
141
+ "fur": "Friulian",
142
+ "fy": "Western Frisian",
143
+ "ga": "Irish",
144
+ "gaa": "Ga",
145
+ "gay": "Gayo",
146
+ "gba": "Gbaya (Central African Republic)",
147
+ "gd": "Scottish Gaelic, Gaelic",
148
+ "gem": "Germanic languages",
149
+ "gez": "Geez",
150
+ "gil": "Gilbertese",
151
+ "gl": "Galician",
152
+ "gmh": "Middle High German (ca. 1050-1500)",
153
+ "gn": "Guarani",
154
+ "goh": "Old High German (ca. 750-1050)",
155
+ "gon": "Gondi",
156
+ "gor": "Gorontalo",
157
+ "got": "Gothic",
158
+ "grb": "Grebo",
159
+ "grc": "Ancient Greek (to 1453)",
160
+ "gu": "Gujarati",
161
+ "gv": "Manx",
162
+ "gwi": "Gwich\u02bcin",
163
+ "ha": "Hausa",
164
+ "hai": "Haida",
165
+ "haw": "Hawaiian",
166
+ "he": "Hebrew",
167
+ "hi": "Hindi",
168
+ "hil": "Hiligaynon",
169
+ "him": "Himachali languages, Western Pahari languages",
170
+ "hit": "Hittite",
171
+ "hmn": "Hmong, Mong",
172
+ "ho": "Hiri Motu",
173
+ "hr": "Croatian",
174
+ "hsb": "Upper Sorbian",
175
+ "ht": "Haitian, Haitian Creole",
176
+ "hu": "Hungarian",
177
+ "hup": "Hupa",
178
+ "hy": "Armenian",
179
+ "hz": "Herero",
180
+ "ia": "Interlingua (International Auxiliary Language Association)",
181
+ "iba": "Iban",
182
+ "id": "Indonesian",
183
+ "ie": "Interlingue, Occidental",
184
+ "ig": "Igbo",
185
+ "ii": "Sichuan Yi, Nuosu",
186
+ "ijo": "Ijo languages",
187
+ "ik": "Inupiaq",
188
+ "ilo": "Iloko",
189
+ "inc": "Indic languages",
190
+ "ine": "Indo-European languages",
191
+ "inh": "Ingush",
192
+ "io": "Ido",
193
+ "ira": "Iranian languages",
194
+ "iro": "Iroquoian languages",
195
+ "is": "Icelandic",
196
+ "it": "Italian",
197
+ "iu": "Inuktitut",
198
+ "ja": "Japanese",
199
+ "jbo": "Lojban",
200
+ "jpr": "Judeo-Persian",
201
+ "jrb": "Judeo-Arabic",
202
+ "jv": "Javanese",
203
+ "ka": "Georgian",
204
+ "kaa": "Kara-Kalpak, Karakalpak",
205
+ "kab": "Kabyle",
206
+ "kac": "Kachin, Jingpho",
207
+ "kam": "Kamba (Kenya)",
208
+ "kar": "Karen languages",
209
+ "kaw": "Kawi",
210
+ "kbd": "Kabardian",
211
+ "kg": "Kongo",
212
+ "kha": "Khasi",
213
+ "khi": "Khoisan languages",
214
+ "kho": "Khotanese, Sakan",
215
+ "ki": "Kikuyu, Gikuyu",
216
+ "kj": "Kuanyama, Kwanyama",
217
+ "kk": "Kazakh",
218
+ "kl": "Kalaallisut, Greenlandic",
219
+ "km": "Khmer, Central Khmer",
220
+ "kmb": "Kimbundu",
221
+ "kn": "Kannada",
222
+ "ko": "Korean",
223
+ "kok": "Konkani (macrolanguage)",
224
+ "kos": "Kosraean",
225
+ "kpe": "Kpelle",
226
+ "kr": "Kanuri",
227
+ "krc": "Karachay-Balkar",
228
+ "kro": "Kru languages",
229
+ "kru": "Kurukh",
230
+ "ks": "Kashmiri",
231
+ "ku": "Kurdish",
232
+ "kum": "Kumyk",
233
+ "kut": "Kutenai",
234
+ "kv": "Komi",
235
+ "kw": "Cornish",
236
+ "ky": "Kirghiz, Kyrgyz",
237
+ "la": "Latin",
238
+ "lad": "Ladino",
239
+ "lah": "Lahnda",
240
+ "lam": "Lamba",
241
+ "lb": "Luxembourgish, Letzeburgesch",
242
+ "lez": "Lezghian",
243
+ "lg": "Ganda, Luganda",
244
+ "li": "Limburgan, Limburger, Limburgish",
245
+ "ln": "Lingala",
246
+ "lo": "Lao",
247
+ "lol": "Mongo",
248
+ "loz": "Lozi",
249
+ "lt": "Lithuanian",
250
+ "lu": "Luba-Katanga",
251
+ "lua": "Luba-Lulua",
252
+ "lui": "Luiseno",
253
+ "lun": "Lunda",
254
+ "luo": "Luo (Kenya and Tanzania), Dholuo",
255
+ "lus": "Lushai",
256
+ "lv": "Latvian",
257
+ "mad": "Madurese",
258
+ "mag": "Magahi",
259
+ "mai": "Maithili",
260
+ "mak": "Makasar",
261
+ "man": "Mandingo, Manding",
262
+ "map": "Austronesian languages",
263
+ "mas": "Masai",
264
+ "mdf": "Moksha",
265
+ "mdr": "Mandar",
266
+ "men": "Mende (Sierra Leone)",
267
+ "mg": "Malagasy",
268
+ "mga": "Middle Irish (900-1200)",
269
+ "mh": "Marshallese",
270
+ "mi": "Maori",
271
+ "mic": "Mi'kmaq, Micmac",
272
+ "min": "Minangkabau",
273
+ "mis": "Uncoded languages",
274
+ "mk": "Macedonian",
275
+ "mkh": "Mon-Khmer languages",
276
+ "ml": "Malayalam",
277
+ "mn": "Mongolian",
278
+ "mnc": "Manchu",
279
+ "mni": "Manipuri",
280
+ "mno": "Manobo languages",
281
+ "moh": "Mohawk",
282
+ "mos": "Mossi",
283
+ "mr": "Marathi",
284
+ "ms": "Malay (macrolanguage)",
285
+ "mt": "Maltese",
286
+ "mul": "Multiple languages",
287
+ "mun": "Munda languages",
288
+ "mus": "Creek",
289
+ "mwl": "Mirandese",
290
+ "mwr": "Marwari",
291
+ "my": "Burmese",
292
+ "myn": "Mayan languages",
293
+ "myv": "Erzya",
294
+ "na": "Nauru",
295
+ "nah": "Nahuatl languages",
296
+ "nai": "North American Indian languages",
297
+ "nap": "Neapolitan",
298
+ "nb": "Norwegian Bokm\u00e5l",
299
+ "nd": "North Ndebele",
300
+ "nds": "Low German, Low Saxon",
301
+ "ne": "Nepali (macrolanguage)",
302
+ "new": "Newari, Nepal Bhasa",
303
+ "ng": "Ndonga",
304
+ "nia": "Nias",
305
+ "nic": "Niger-Kordofanian languages",
306
+ "niu": "Niuean",
307
+ "nl": "Dutch, Flemish",
308
+ "nn": "Norwegian Nynorsk",
309
+ "no": "Norwegian",
310
+ "nog": "Nogai",
311
+ "non": "Old Norse",
312
+ "nr": "South Ndebele",
313
+ "nso": "Pedi, Northern Sotho, Sepedi",
314
+ "nub": "Nubian languages",
315
+ "nv": "Navajo, Navaho",
316
+ "nwc": "Classical Newari, Classical Nepal Bhasa, Old Newari",
317
+ "ny": "Nyanja, Chewa, Chichewa",
318
+ "nym": "Nyamwezi",
319
+ "nyn": "Nyankole",
320
+ "nyo": "Nyoro",
321
+ "nzi": "Nzima",
322
+ "oc": "Occitan (post 1500)",
323
+ "oj": "Ojibwa",
324
+ "om": "Oromo",
325
+ "or": "Oriya (macrolanguage), Odia (macrolanguage)",
326
+ "os": "Ossetian, Ossetic",
327
+ "osa": "Osage",
328
+ "ota": "Ottoman Turkish (1500-1928)",
329
+ "oto": "Otomian languages",
330
+ "pa": "Panjabi, Punjabi",
331
+ "paa": "Papuan languages",
332
+ "pag": "Pangasinan",
333
+ "pal": "Pahlavi",
334
+ "pam": "Pampanga, Kapampangan",
335
+ "pap": "Papiamento",
336
+ "pau": "Palauan",
337
+ "peo": "Old Persian (ca. 600-400 B.C.)",
338
+ "phi": "Philippine languages",
339
+ "phn": "Phoenician",
340
+ "pi": "Pali",
341
+ "pl": "Polish",
342
+ "pon": "Pohnpeian",
343
+ "pra": "Prakrit languages",
344
+ "pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
345
+ "ps": "Pushto, Pashto",
346
+ "pt": "Portuguese",
347
+ "qaa..qtz": "Private use",
348
+ "qu": "Quechua",
349
+ "raj": "Rajasthani",
350
+ "rap": "Rapanui",
351
+ "rar": "Rarotongan, Cook Islands Maori",
352
+ "rm": "Romansh",
353
+ "rn": "Rundi",
354
+ "ro": "Romanian, Moldavian, Moldovan",
355
+ "roa": "Romance languages",
356
+ "rom": "Romany",
357
+ "ru": "Russian",
358
+ "rup": "Macedo-Romanian, Aromanian, Arumanian",
359
+ "rw": "Kinyarwanda",
360
+ "sa": "Sanskrit",
361
+ "sad": "Sandawe",
362
+ "sah": "Yakut",
363
+ "sai": "South American Indian languages",
364
+ "sal": "Salishan languages",
365
+ "sam": "Samaritan Aramaic",
366
+ "sas": "Sasak",
367
+ "sat": "Santali",
368
+ "sc": "Sardinian",
369
+ "scn": "Sicilian",
370
+ "sco": "Scots",
371
+ "sd": "Sindhi",
372
+ "se": "Northern Sami",
373
+ "sel": "Selkup",
374
+ "sem": "Semitic languages",
375
+ "sg": "Sango",
376
+ "sga": "Old Irish (to 900)",
377
+ "sgn": "Sign languages",
378
+ "sh": "Serbo-Croatian",
379
+ "shn": "Shan",
380
+ "si": "Sinhala, Sinhalese",
381
+ "sid": "Sidamo",
382
+ "sio": "Siouan languages",
383
+ "sit": "Sino-Tibetan languages",
384
+ "sk": "Slovak",
385
+ "sl": "Slovenian",
386
+ "sla": "Slavic languages",
387
+ "sm": "Samoan",
388
+ "sma": "Southern Sami",
389
+ "smi": "Sami languages",
390
+ "smj": "Lule Sami",
391
+ "smn": "Inari Sami",
392
+ "sms": "Skolt Sami",
393
+ "sn": "Shona",
394
+ "snk": "Soninke",
395
+ "so": "Somali",
396
+ "sog": "Sogdian",
397
+ "son": "Songhai languages",
398
+ "sq": "Albanian",
399
+ "sr": "Serbian",
400
+ "srn": "Sranan Tongo",
401
+ "srr": "Serer",
402
+ "ss": "Swati",
403
+ "ssa": "Nilo-Saharan languages",
404
+ "st": "Southern Sotho",
405
+ "su": "Sundanese",
406
+ "suk": "Sukuma",
407
+ "sus": "Susu",
408
+ "sux": "Sumerian",
409
+ "sv": "Swedish",
410
+ "sw": "Swahili (macrolanguage)",
411
+ "syr": "Syriac",
412
+ "ta": "Tamil",
413
+ "tai": "Tai languages",
414
+ "te": "Telugu",
415
+ "tem": "Timne",
416
+ "ter": "Tereno",
417
+ "tet": "Tetum",
418
+ "tg": "Tajik",
419
+ "th": "Thai",
420
+ "ti": "Tigrinya",
421
+ "tig": "Tigre",
422
+ "tiv": "Tiv",
423
+ "tk": "Turkmen",
424
+ "tkl": "Tokelau",
425
+ "tl": "Tagalog",
426
+ "tlh": "Klingon, tlhIngan Hol",
427
+ "tli": "Tlingit",
428
+ "tmh": "Tamashek",
429
+ "tn": "Tswana",
430
+ "to": "Tonga (Tonga Islands)",
431
+ "tog": "Tonga (Nyasa)",
432
+ "tpi": "Tok Pisin",
433
+ "tr": "Turkish",
434
+ "ts": "Tsonga",
435
+ "tsi": "Tsimshian",
436
+ "tt": "Tatar",
437
+ "tum": "Tumbuka",
438
+ "tup": "Tupi languages",
439
+ "tut": "Altaic languages",
440
+ "tvl": "Tuvalu",
441
+ "tw": "Twi",
442
+ "ty": "Tahitian",
443
+ "tyv": "Tuvinian",
444
+ "udm": "Udmurt",
445
+ "ug": "Uighur, Uyghur",
446
+ "uga": "Ugaritic",
447
+ "uk": "Ukrainian",
448
+ "umb": "Umbundu",
449
+ "und": "Undetermined",
450
+ "ur": "Urdu",
451
+ "uz": "Uzbek",
452
+ "vai": "Vai",
453
+ "ve": "Venda",
454
+ "vi": "Vietnamese",
455
+ "vo": "Volap\u00fck",
456
+ "vot": "Votic",
457
+ "wa": "Walloon",
458
+ "wak": "Wakashan languages",
459
+ "wal": "Wolaytta, Wolaitta",
460
+ "war": "Waray (Philippines)",
461
+ "was": "Washo",
462
+ "wen": "Sorbian languages",
463
+ "wo": "Wolof",
464
+ "xal": "Kalmyk, Oirat",
465
+ "xh": "Xhosa",
466
+ "yao": "Yao",
467
+ "yap": "Yapese",
468
+ "yi": "Yiddish",
469
+ "yo": "Yoruba",
470
+ "ypk": "Yupik languages",
471
+ "za": "Zhuang, Chuang",
472
+ "zap": "Zapotec",
473
+ "zen": "Zenaga",
474
+ "zh": "Chinese",
475
+ "znd": "Zande languages",
476
+ "zu": "Zulu",
477
+ "zun": "Zuni"
478
+ }
language_set_full.json ADDED
The diff for this file is too large to render. See raw diff
 
license_set.json ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "other": "Other license",
3
+ "unknown": "License information unavailable",
4
+ "0bsd": "BSD Zero Clause License",
5
+ "aal": "Attribution Assurance License",
6
+ "abstyles": "Abstyles License",
7
+ "adobe-2006": "Adobe Systems Incorporated Source Code License Agreement",
8
+ "adobe-glyph": "Adobe Glyph List License",
9
+ "adsl": "Amazon Digital Services License",
10
+ "afl-1.1": "Academic Free License v1.1",
11
+ "afl-1.2": "Academic Free License v1.2",
12
+ "afl-2.0": "Academic Free License v2.0",
13
+ "afl-2.1": "Academic Free License v2.1",
14
+ "afl-3.0": "Academic Free License v3.0",
15
+ "afmparse": "Afmparse License",
16
+ "agpl-1.0": "Affero General Public License v1.0",
17
+ "agpl-1.0-only": "Affero General Public License v1.0 only",
18
+ "agpl-1.0-or-later": "Affero General Public License v1.0 or later",
19
+ "agpl-3.0": "GNU Affero General Public License v3.0",
20
+ "agpl-3.0-only": "GNU Affero General Public License v3.0 only",
21
+ "agpl-3.0-or-later": "GNU Affero General Public License v3.0 or later",
22
+ "aladdin": "Aladdin Free Public License",
23
+ "amdplpa": "AMD's plpa_map.c License",
24
+ "aml": "Apple MIT License",
25
+ "ampas": "Academy of Motion Picture Arts and Sciences BSD",
26
+ "antlr-pd": "ANTLR Software Rights Notice",
27
+ "antlr-pd-fallback": "ANTLR Software Rights Notice with license fallback",
28
+ "apache-1.0": "Apache License 1.0",
29
+ "apache-1.1": "Apache License 1.1",
30
+ "apache-2.0": "Apache License 2.0",
31
+ "apafml": "Adobe Postscript AFM License",
32
+ "apl-1.0": "Adaptive Public License 1.0",
33
+ "apsl-1.0": "Apple Public Source License 1.0",
34
+ "apsl-1.1": "Apple Public Source License 1.1",
35
+ "apsl-1.2": "Apple Public Source License 1.2",
36
+ "apsl-2.0": "Apple Public Source License 2.0",
37
+ "artistic-1.0": "Artistic License 1.0",
38
+ "artistic-1.0-cl8": "Artistic License 1.0 w/clause 8",
39
+ "artistic-1.0-perl": "Artistic License 1.0 (Perl)",
40
+ "artistic-2.0": "Artistic License 2.0",
41
+ "bahyph": "Bahyph License",
42
+ "barr": "Barr License",
43
+ "beerware": "Beerware License",
44
+ "bittorrent-1.0": "BitTorrent Open Source License v1.0",
45
+ "bittorrent-1.1": "BitTorrent Open Source License v1.1",
46
+ "blessing": "SQLite Blessing",
47
+ "blueoak-1.0.0": "Blue Oak Model License 1.0.0",
48
+ "borceux": "Borceux license",
49
+ "bsd-1-clause": "BSD 1-Clause License",
50
+ "bsd-2-clause": "BSD 2-Clause \"Simplified\" License",
51
+ "bsd-2-clause-freebsd": "BSD 2-Clause FreeBSD License",
52
+ "bsd-2-clause-netbsd": "BSD 2-Clause NetBSD License",
53
+ "bsd-2-clause-patent": "BSD-2-Clause Plus Patent License",
54
+ "bsd-2-clause-views": "BSD 2-Clause with views sentence",
55
+ "bsd-3-clause": "BSD 3-Clause \"New\" or \"Revised\" License",
56
+ "bsd-3-clause-attribution": "BSD with attribution",
57
+ "bsd-3-clause-clear": "BSD 3-Clause Clear License",
58
+ "bsd-3-clause-lbnl": "Lawrence Berkeley National Labs BSD variant license",
59
+ "bsd-3-clause-no-nuclear-license": "BSD 3-Clause No Nuclear License",
60
+ "bsd-3-clause-no-nuclear-license-2014": "BSD 3-Clause No Nuclear License 2014",
61
+ "bsd-3-clause-no-nuclear-warranty": "BSD 3-Clause No Nuclear Warranty",
62
+ "bsd-3-clause-open-mpi": "BSD 3-Clause Open MPI variant",
63
+ "bsd-4-clause": "BSD 4-Clause \"Original\" or \"Old\" License",
64
+ "bsd-4-clause-uc": "BSD-4-Clause (University of California-Specific)",
65
+ "bsd-protection": "BSD Protection License",
66
+ "bsd-source-code": "BSD Source Code Attribution",
67
+ "bsl-1.0": "Boost Software License 1.0",
68
+ "busl-1.1": "Business Source License 1.1",
69
+ "bzip2-1.0.5": "bzip2 and libbzip2 License v1.0.5",
70
+ "bzip2-1.0.6": "bzip2 and libbzip2 License v1.0.6",
71
+ "cal-1.0": "Cryptographic Autonomy License 1.0",
72
+ "cal-1.0-combined-work-exception": "Cryptographic Autonomy License 1.0 (Combined Work Exception)",
73
+ "caldera": "Caldera License",
74
+ "catosl-1.1": "Computer Associates Trusted Open Source License 1.1",
75
+ "cc-by-1.0": "Creative Commons Attribution 1.0 Generic",
76
+ "cc-by-2.0": "Creative Commons Attribution 2.0 Generic",
77
+ "cc-by-2.5": "Creative Commons Attribution 2.5 Generic",
78
+ "cc-by-3.0": "Creative Commons Attribution 3.0 Unported",
79
+ "cc-by-3.0-at": "Creative Commons Attribution 3.0 Austria",
80
+ "cc-by-3.0-us": "Creative Commons Attribution 3.0 United States",
81
+ "cc-by-4.0": "Creative Commons Attribution 4.0 International",
82
+ "cc-by-nc-1.0": "Creative Commons Attribution Non Commercial 1.0 Generic",
83
+ "cc-by-nc-2.0": "Creative Commons Attribution Non Commercial 2.0 Generic",
84
+ "cc-by-nc-2.5": "Creative Commons Attribution Non Commercial 2.5 Generic",
85
+ "cc-by-nc-3.0": "Creative Commons Attribution Non Commercial 3.0 Unported",
86
+ "cc-by-nc-4.0": "Creative Commons Attribution Non Commercial 4.0 International",
87
+ "cc-by-nc-nd-1.0": "Creative Commons Attribution Non Commercial No Derivatives 1.0 Generic",
88
+ "cc-by-nc-nd-2.0": "Creative Commons Attribution Non Commercial No Derivatives 2.0 Generic",
89
+ "cc-by-nc-nd-2.5": "Creative Commons Attribution Non Commercial No Derivatives 2.5 Generic",
90
+ "cc-by-nc-nd-3.0": "Creative Commons Attribution Non Commercial No Derivatives 3.0 Unported",
91
+ "cc-by-nc-nd-3.0-igo": "Creative Commons Attribution Non Commercial No Derivatives 3.0 IGO",
92
+ "cc-by-nc-nd-4.0": "Creative Commons Attribution Non Commercial No Derivatives 4.0 International",
93
+ "cc-by-nc-sa-1.0": "Creative Commons Attribution Non Commercial Share Alike 1.0 Generic",
94
+ "cc-by-nc-sa-2.0": "Creative Commons Attribution Non Commercial Share Alike 2.0 Generic",
95
+ "cc-by-nc-sa-2.5": "Creative Commons Attribution Non Commercial Share Alike 2.5 Generic",
96
+ "cc-by-nc-sa-3.0": "Creative Commons Attribution Non Commercial Share Alike 3.0 Unported",
97
+ "cc-by-nc-sa-4.0": "Creative Commons Attribution Non Commercial Share Alike 4.0 International",
98
+ "cc-by-nd-1.0": "Creative Commons Attribution No Derivatives 1.0 Generic",
99
+ "cc-by-nd-2.0": "Creative Commons Attribution No Derivatives 2.0 Generic",
100
+ "cc-by-nd-2.5": "Creative Commons Attribution No Derivatives 2.5 Generic",
101
+ "cc-by-nd-3.0": "Creative Commons Attribution No Derivatives 3.0 Unported",
102
+ "cc-by-nd-4.0": "Creative Commons Attribution No Derivatives 4.0 International",
103
+ "cc-by-sa-1.0": "Creative Commons Attribution Share Alike 1.0 Generic",
104
+ "cc-by-sa-2.0": "Creative Commons Attribution Share Alike 2.0 Generic",
105
+ "cc-by-sa-2.0-uk": "Creative Commons Attribution Share Alike 2.0 England and Wales",
106
+ "cc-by-sa-2.5": "Creative Commons Attribution Share Alike 2.5 Generic",
107
+ "cc-by-sa-3.0": "Creative Commons Attribution Share Alike 3.0 Unported",
108
+ "cc-by-sa-3.0-at": "Creative Commons Attribution-Share Alike 3.0 Austria",
109
+ "cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0 International",
110
+ "cc-pddc": "Creative Commons Public Domain Dedication and Certification",
111
+ "cc0-1.0": "Creative Commons Zero v1.0 Universal",
112
+ "cddl-1.0": "Common Development and Distribution License 1.0",
113
+ "cddl-1.1": "Common Development and Distribution License 1.1",
114
+ "cdla-permissive-1.0": "Community Data License Agreement Permissive 1.0",
115
+ "cdla-sharing-1.0": "Community Data License Agreement Sharing 1.0",
116
+ "cecill-1.0": "CeCILL Free Software License Agreement v1.0",
117
+ "cecill-1.1": "CeCILL Free Software License Agreement v1.1",
118
+ "cecill-2.0": "CeCILL Free Software License Agreement v2.0",
119
+ "cecill-2.1": "CeCILL Free Software License Agreement v2.1",
120
+ "cecill-b": "CeCILL-B Free Software License Agreement",
121
+ "cecill-c": "CeCILL-C Free Software License Agreement",
122
+ "cern-ohl-1.1": "CERN Open Hardware Licence v1.1",
123
+ "cern-ohl-1.2": "CERN Open Hardware Licence v1.2",
124
+ "cern-ohl-p-2.0": "CERN Open Hardware Licence Version 2 - Permissive",
125
+ "cern-ohl-s-2.0": "CERN Open Hardware Licence Version 2 - Strongly Reciprocal",
126
+ "cern-ohl-w-2.0": "CERN Open Hardware Licence Version 2 - Weakly Reciprocal",
127
+ "clartistic": "Clarified Artistic License",
128
+ "cnri-jython": "CNRI Jython License",
129
+ "cnri-python": "CNRI Python License",
130
+ "cnri-python-gpl-compatible": "CNRI Python Open Source GPL Compatible License Agreement",
131
+ "condor-1.1": "Condor Public License v1.1",
132
+ "copyleft-next-0.3.0": "copyleft-next 0.3.0",
133
+ "copyleft-next-0.3.1": "copyleft-next 0.3.1",
134
+ "cpal-1.0": "Common Public Attribution License 1.0",
135
+ "cpl-1.0": "Common Public License 1.0",
136
+ "cpol-1.02": "Code Project Open License 1.02",
137
+ "crossword": "Crossword License",
138
+ "crystalstacker": "CrystalStacker License",
139
+ "cua-opl-1.0": "CUA Office Public License v1.0",
140
+ "cube": "Cube License",
141
+ "curl": "curl License",
142
+ "d-fsl-1.0": "Deutsche Freie Software Lizenz",
143
+ "diffmark": "diffmark license",
144
+ "doc": "DOC License",
145
+ "dotseqn": "Dotseqn License",
146
+ "dsdp": "DSDP License",
147
+ "dvipdfm": "dvipdfm License",
148
+ "ecl-1.0": "Educational Community License v1.0",
149
+ "ecl-2.0": "Educational Community License v2.0",
150
+ "ecos-2.0": "eCos license version 2.0",
151
+ "efl-1.0": "Eiffel Forum License v1.0",
152
+ "efl-2.0": "Eiffel Forum License v2.0",
153
+ "egenix": "eGenix.com Public License 1.1.0",
154
+ "entessa": "Entessa Public License v1.0",
155
+ "epics": "EPICS Open License",
156
+ "epl-1.0": "Eclipse Public License 1.0",
157
+ "epl-2.0": "Eclipse Public License 2.0",
158
+ "erlpl-1.1": "Erlang Public License v1.1",
159
+ "etalab-2.0": "Etalab Open License 2.0",
160
+ "eudatagrid": "EU DataGrid Software License",
161
+ "eupl-1.0": "European Union Public License 1.0",
162
+ "eupl-1.1": "European Union Public License 1.1",
163
+ "eupl-1.2": "European Union Public License 1.2",
164
+ "eurosym": "Eurosym License",
165
+ "fair": "Fair License",
166
+ "frameworx-1.0": "Frameworx Open License 1.0",
167
+ "freeimage": "FreeImage Public License v1.0",
168
+ "fsfap": "FSF All Permissive License",
169
+ "fsful": "FSF Unlimited License",
170
+ "fsfullr": "FSF Unlimited License (with License Retention)",
171
+ "ftl": "Freetype Project License",
172
+ "gfdl-1.1": "GNU Free Documentation License v1.1",
173
+ "gfdl-1.1-invariants-only": "GNU Free Documentation License v1.1 only - invariants",
174
+ "gfdl-1.1-invariants-or-later": "GNU Free Documentation License v1.1 or later - invariants",
175
+ "gfdl-1.1-no-invariants-only": "GNU Free Documentation License v1.1 only - no invariants",
176
+ "gfdl-1.1-no-invariants-or-later": "GNU Free Documentation License v1.1 or later - no invariants",
177
+ "gfdl-1.1-only": "GNU Free Documentation License v1.1 only",
178
+ "gfdl-1.1-or-later": "GNU Free Documentation License v1.1 or later",
179
+ "gfdl-1.2": "GNU Free Documentation License v1.2",
180
+ "gfdl-1.2-invariants-only": "GNU Free Documentation License v1.2 only - invariants",
181
+ "gfdl-1.2-invariants-or-later": "GNU Free Documentation License v1.2 or later - invariants",
182
+ "gfdl-1.2-no-invariants-only": "GNU Free Documentation License v1.2 only - no invariants",
183
+ "gfdl-1.2-no-invariants-or-later": "GNU Free Documentation License v1.2 or later - no invariants",
184
+ "gfdl-1.2-only": "GNU Free Documentation License v1.2 only",
185
+ "gfdl-1.2-or-later": "GNU Free Documentation License v1.2 or later",
186
+ "gfdl-1.3": "GNU Free Documentation License v1.3",
187
+ "gfdl-1.3-invariants-only": "GNU Free Documentation License v1.3 only - invariants",
188
+ "gfdl-1.3-invariants-or-later": "GNU Free Documentation License v1.3 or later - invariants",
189
+ "gfdl-1.3-no-invariants-only": "GNU Free Documentation License v1.3 only - no invariants",
190
+ "gfdl-1.3-no-invariants-or-later": "GNU Free Documentation License v1.3 or later - no invariants",
191
+ "gfdl-1.3-only": "GNU Free Documentation License v1.3 only",
192
+ "gfdl-1.3-or-later": "GNU Free Documentation License v1.3 or later",
193
+ "giftware": "Giftware License",
194
+ "gl2ps": "GL2PS License",
195
+ "glide": "3dfx Glide License",
196
+ "glulxe": "Glulxe License",
197
+ "glwtpl": "Good Luck With That Public License",
198
+ "gnuplot": "gnuplot License",
199
+ "gpl-1.0": "GNU General Public License v1.0 only",
200
+ "gpl-1.0+": "GNU General Public License v1.0 or later",
201
+ "gpl-1.0-only": "GNU General Public License v1.0 only",
202
+ "gpl-1.0-or-later": "GNU General Public License v1.0 or later",
203
+ "gpl-2.0": "GNU General Public License v2.0 only",
204
+ "gpl-2.0+": "GNU General Public License v2.0 or later",
205
+ "gpl-2.0-only": "GNU General Public License v2.0 only",
206
+ "gpl-2.0-or-later": "GNU General Public License v2.0 or later",
207
+ "gpl-2.0-with-autoconf-exception": "GNU General Public License v2.0 w/Autoconf exception",
208
+ "gpl-2.0-with-bison-exception": "GNU General Public License v2.0 w/Bison exception",
209
+ "gpl-2.0-with-classpath-exception": "GNU General Public License v2.0 w/Classpath exception",
210
+ "gpl-2.0-with-font-exception": "GNU General Public License v2.0 w/Font exception",
211
+ "gpl-2.0-with-gcc-exception": "GNU General Public License v2.0 w/GCC Runtime Library exception",
212
+ "gpl-3.0": "GNU General Public License v3.0 only",
213
+ "gpl-3.0+": "GNU General Public License v3.0 or later",
214
+ "gpl-3.0-only": "GNU General Public License v3.0 only",
215
+ "gpl-3.0-or-later": "GNU General Public License v3.0 or later",
216
+ "gpl-3.0-with-autoconf-exception": "GNU General Public License v3.0 w/Autoconf exception",
217
+ "gpl-3.0-with-gcc-exception": "GNU General Public License v3.0 w/GCC Runtime Library exception",
218
+ "gsoap-1.3b": "gSOAP Public License v1.3b",
219
+ "haskellreport": "Haskell Language Report License",
220
+ "hippocratic-2.1": "Hippocratic License 2.1",
221
+ "hpnd": "Historical Permission Notice and Disclaimer",
222
+ "hpnd-sell-variant": "Historical Permission Notice and Disclaimer - sell variant",
223
+ "htmltidy": "HTML Tidy License",
224
+ "ibm-pibs": "IBM PowerPC Initialization and Boot Software",
225
+ "icu": "ICU License",
226
+ "ijg": "Independent JPEG Group License",
227
+ "imagemagick": "ImageMagick License",
228
+ "imatix": "iMatix Standard Function Library Agreement",
229
+ "imlib2": "Imlib2 License",
230
+ "info-zip": "Info-ZIP License",
231
+ "intel": "Intel Open Source License",
232
+ "intel-acpi": "Intel ACPI Software License Agreement",
233
+ "interbase-1.0": "Interbase Public License v1.0",
234
+ "ipa": "IPA Font License",
235
+ "ipl-1.0": "IBM Public License v1.0",
236
+ "isc": "ISC License",
237
+ "jasper-2.0": "JasPer License",
238
+ "jpnic": "Japan Network Information Center License",
239
+ "json": "JSON License",
240
+ "lal-1.2": "Licence Art Libre 1.2",
241
+ "lal-1.3": "Licence Art Libre 1.3",
242
+ "latex2e": "Latex2e License",
243
+ "leptonica": "Leptonica License",
244
+ "lgpl-2.0": "GNU Library General Public License v2 only",
245
+ "lgpl-2.0+": "GNU Library General Public License v2 or later",
246
+ "lgpl-2.0-only": "GNU Library General Public License v2 only",
247
+ "lgpl-2.0-or-later": "GNU Library General Public License v2 or later",
248
+ "lgpl-2.1": "GNU Lesser General Public License v2.1 only",
249
+ "lgpl-2.1+": "GNU Lesser General Public License v2.1 or later",
250
+ "lgpl-2.1-only": "GNU Lesser General Public License v2.1 only",
251
+ "lgpl-2.1-or-later": "GNU Lesser General Public License v2.1 or later",
252
+ "lgpl-3.0": "GNU Lesser General Public License v3.0 only",
253
+ "lgpl-3.0+": "GNU Lesser General Public License v3.0 or later",
254
+ "lgpl-3.0-only": "GNU Lesser General Public License v3.0 only",
255
+ "lgpl-3.0-or-later": "GNU Lesser General Public License v3.0 or later",
256
+ "lgpllr": "Lesser General Public License For Linguistic Resources",
257
+ "libpng": "libpng License",
258
+ "libpng-2.0": "PNG Reference Library version 2",
259
+ "libselinux-1.0": "libselinux public domain notice",
260
+ "libtiff": "libtiff License",
261
+ "liliq-p-1.1": "Licence Libre du Qu\u00e9bec \u2013 Permissive version 1.1",
262
+ "liliq-r-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 version 1.1",
263
+ "liliq-rplus-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 forte version 1.1",
264
+ "linux-openib": "Linux Kernel Variant of OpenIB.org license",
265
+ "lpl-1.0": "Lucent Public License Version 1.0",
266
+ "lpl-1.02": "Lucent Public License v1.02",
267
+ "lppl-1.0": "LaTeX Project Public License v1.0",
268
+ "lppl-1.1": "LaTeX Project Public License v1.1",
269
+ "lppl-1.2": "LaTeX Project Public License v1.2",
270
+ "lppl-1.3a": "LaTeX Project Public License v1.3a",
271
+ "lppl-1.3c": "LaTeX Project Public License v1.3c",
272
+ "makeindex": "MakeIndex License",
273
+ "miros": "The MirOS Licence",
274
+ "mit": "MIT License",
275
+ "mit-0": "MIT No Attribution",
276
+ "mit-advertising": "Enlightenment License (e16)",
277
+ "mit-cmu": "CMU License",
278
+ "mit-enna": "enna License",
279
+ "mit-feh": "feh License",
280
+ "mit-open-group": "MIT Open Group variant",
281
+ "mitnfa": "MIT +no-false-attribs license",
282
+ "motosoto": "Motosoto License",
283
+ "mpich2": "mpich2 License",
284
+ "mpl-1.0": "Mozilla Public License 1.0",
285
+ "mpl-1.1": "Mozilla Public License 1.1",
286
+ "mpl-2.0": "Mozilla Public License 2.0",
287
+ "mpl-2.0-no-copyleft-exception": "Mozilla Public License 2.0 (no copyleft exception)",
288
+ "ms-pl": "Microsoft Public License",
289
+ "ms-rl": "Microsoft Reciprocal License",
290
+ "mtll": "Matrix Template Library License",
291
+ "mulanpsl-1.0": "Mulan Permissive Software License, Version 1",
292
+ "mulanpsl-2.0": "Mulan Permissive Software License, Version 2",
293
+ "multics": "Multics License",
294
+ "mup": "Mup License",
295
+ "nasa-1.3": "NASA Open Source Agreement 1.3",
296
+ "naumen": "Naumen Public License",
297
+ "nbpl-1.0": "Net Boolean Public License v1",
298
+ "ncgl-uk-2.0": "Non-Commercial Government Licence",
299
+ "ncsa": "University of Illinois/NCSA Open Source License",
300
+ "net-snmp": "Net-SNMP License",
301
+ "netcdf": "NetCDF license",
302
+ "newsletr": "Newsletr License",
303
+ "ngpl": "Nethack General Public License",
304
+ "nist-pd": "NIST Public Domain Notice",
305
+ "nist-pd-fallback": "NIST Public Domain Notice with license fallback",
306
+ "nlod-1.0": "Norwegian Licence for Open Government Data",
307
+ "nlpl": "No Limit Public License",
308
+ "nokia": "Nokia Open Source License",
309
+ "nosl": "Netizen Open Source License",
310
+ "noweb": "Noweb License",
311
+ "npl-1.0": "Netscape Public License v1.0",
312
+ "npl-1.1": "Netscape Public License v1.1",
313
+ "nposl-3.0": "Non-Profit Open Software License 3.0",
314
+ "nrl": "NRL License",
315
+ "ntp": "NTP License",
316
+ "ntp-0": "NTP No Attribution",
317
+ "nunit": "Nunit License",
318
+ "o-uda-1.0": "Open Use of Data Agreement v1.0",
319
+ "occt-pl": "Open CASCADE Technology Public License",
320
+ "oclc-2.0": "OCLC Research Public License 2.0",
321
+ "odbl-1.0": "ODC Open Database License v1.0",
322
+ "odc-by-1.0": "Open Data Commons Attribution License v1.0",
323
+ "ofl-1.0": "SIL Open Font License 1.0",
324
+ "ofl-1.0-no-rfn": "SIL Open Font License 1.0 with no Reserved Font Name",
325
+ "ofl-1.0-rfn": "SIL Open Font License 1.0 with Reserved Font Name",
326
+ "ofl-1.1": "SIL Open Font License 1.1",
327
+ "ofl-1.1-no-rfn": "SIL Open Font License 1.1 with no Reserved Font Name",
328
+ "ofl-1.1-rfn": "SIL Open Font License 1.1 with Reserved Font Name",
329
+ "ogc-1.0": "OGC Software License, Version 1.0",
330
+ "ogl-canada-2.0": "Open Government Licence - Canada",
331
+ "ogl-uk-1.0": "Open Government Licence v1.0",
332
+ "ogl-uk-2.0": "Open Government Licence v2.0",
333
+ "ogl-uk-3.0": "Open Government Licence v3.0",
334
+ "ogtsl": "Open Group Test Suite License",
335
+ "oldap-1.1": "Open LDAP Public License v1.1",
336
+ "oldap-1.2": "Open LDAP Public License v1.2",
337
+ "oldap-1.3": "Open LDAP Public License v1.3",
338
+ "oldap-1.4": "Open LDAP Public License v1.4",
339
+ "oldap-2.0": "Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)",
340
+ "oldap-2.0.1": "Open LDAP Public License v2.0.1",
341
+ "oldap-2.1": "Open LDAP Public License v2.1",
342
+ "oldap-2.2": "Open LDAP Public License v2.2",
343
+ "oldap-2.2.1": "Open LDAP Public License v2.2.1",
344
+ "oldap-2.2.2": "Open LDAP Public License 2.2.2",
345
+ "oldap-2.3": "Open LDAP Public License v2.3",
346
+ "oldap-2.4": "Open LDAP Public License v2.4",
347
+ "oldap-2.5": "Open LDAP Public License v2.5",
348
+ "oldap-2.6": "Open LDAP Public License v2.6",
349
+ "oldap-2.7": "Open LDAP Public License v2.7",
350
+ "oldap-2.8": "Open LDAP Public License v2.8",
351
+ "oml": "Open Market License",
352
+ "openssl": "OpenSSL License",
353
+ "opl-1.0": "Open Public License v1.0",
354
+ "oset-pl-2.1": "OSET Public License version 2.1",
355
+ "osl-1.0": "Open Software License 1.0",
356
+ "osl-1.1": "Open Software License 1.1",
357
+ "osl-2.0": "Open Software License 2.0",
358
+ "osl-2.1": "Open Software License 2.1",
359
+ "osl-3.0": "Open Software License 3.0",
360
+ "parity-6.0.0": "The Parity Public License 6.0.0",
361
+ "parity-7.0.0": "The Parity Public License 7.0.0",
362
+ "pddl-1.0": "ODC Public Domain Dedication & License 1.0",
363
+ "php-3.0": "PHP License v3.0",
364
+ "php-3.01": "PHP License v3.01",
365
+ "plexus": "Plexus Classworlds License",
366
+ "polyform-noncommercial-1.0.0": "PolyForm Noncommercial License 1.0.0",
367
+ "polyform-small-business-1.0.0": "PolyForm Small Business License 1.0.0",
368
+ "postgresql": "PostgreSQL License",
369
+ "psf-2.0": "Python Software Foundation License 2.0",
370
+ "psfrag": "psfrag License",
371
+ "psutils": "psutils License",
372
+ "python-2.0": "Python License 2.0",
373
+ "qhull": "Qhull License",
374
+ "qpl-1.0": "Q Public License 1.0",
375
+ "rdisc": "Rdisc License",
376
+ "rhecos-1.1": "Red Hat eCos Public License v1.1",
377
+ "rpl-1.1": "Reciprocal Public License 1.1",
378
+ "rpl-1.5": "Reciprocal Public License 1.5",
379
+ "rpsl-1.0": "RealNetworks Public Source License v1.0",
380
+ "rsa-md": "RSA Message-Digest License",
381
+ "rscpl": "Ricoh Source Code Public License",
382
+ "ruby": "Ruby License",
383
+ "sax-pd": "Sax Public Domain Notice",
384
+ "saxpath": "Saxpath License",
385
+ "scea": "SCEA Shared Source License",
386
+ "sendmail": "Sendmail License",
387
+ "sendmail-8.23": "Sendmail License 8.23",
388
+ "sgi-b-1.0": "SGI Free Software License B v1.0",
389
+ "sgi-b-1.1": "SGI Free Software License B v1.1",
390
+ "sgi-b-2.0": "SGI Free Software License B v2.0",
391
+ "shl-0.5": "Solderpad Hardware License v0.5",
392
+ "shl-0.51": "Solderpad Hardware License, Version 0.51",
393
+ "simpl-2.0": "Simple Public License 2.0",
394
+ "sissl": "Sun Industry Standards Source License v1.1",
395
+ "sissl-1.2": "Sun Industry Standards Source License v1.2",
396
+ "sleepycat": "Sleepycat License",
397
+ "smlnj": "Standard ML of New Jersey License",
398
+ "smppl": "Secure Messaging Protocol Public License",
399
+ "snia": "SNIA Public License 1.1",
400
+ "spencer-86": "Spencer License 86",
401
+ "spencer-94": "Spencer License 94",
402
+ "spencer-99": "Spencer License 99",
403
+ "spl-1.0": "Sun Public License v1.0",
404
+ "ssh-openssh": "SSH OpenSSH license",
405
+ "ssh-short": "SSH short notice",
406
+ "sspl-1.0": "Server Side Public License, v 1",
407
+ "standardml-nj": "Standard ML of New Jersey License",
408
+ "sugarcrm-1.1.3": "SugarCRM Public License v1.1.3",
409
+ "swl": "Scheme Widget Library (SWL) Software License Agreement",
410
+ "tapr-ohl-1.0": "TAPR Open Hardware License v1.0",
411
+ "tcl": "TCL/TK License",
412
+ "tcp-wrappers": "TCP Wrappers License",
413
+ "tmate": "TMate Open Source License",
414
+ "torque-1.1": "TORQUE v2.5+ Software License v1.1",
415
+ "tosl": "Trusster Open Source License",
416
+ "tu-berlin-1.0": "Technische Universitaet Berlin License 1.0",
417
+ "tu-berlin-2.0": "Technische Universitaet Berlin License 2.0",
418
+ "ucl-1.0": "Upstream Compatibility License v1.0",
419
+ "unicode-dfs-2015": "Unicode License Agreement - Data Files and Software (2015)",
420
+ "unicode-dfs-2016": "Unicode License Agreement - Data Files and Software (2016)",
421
+ "unicode-tou": "Unicode Terms of Use",
422
+ "unlicense": "The Unlicense",
423
+ "upl-1.0": "Universal Permissive License v1.0",
424
+ "vim": "Vim License",
425
+ "vostrom": "VOSTROM Public License for Open Source",
426
+ "vsl-1.0": "Vovida Software License v1.0",
427
+ "w3c": "W3C Software Notice and License (2002-12-31)",
428
+ "w3c-19980720": "W3C Software Notice and License (1998-07-20)",
429
+ "w3c-20150513": "W3C Software Notice and Document License (2015-05-13)",
430
+ "watcom-1.0": "Sybase Open Watcom Public License 1.0",
431
+ "wsuipa": "Wsuipa License",
432
+ "wtfpl": "Do What The F*ck You Want To Public License",
433
+ "wxwindows": "wxWindows Library License",
434
+ "x11": "X11 License",
435
+ "xerox": "Xerox License",
436
+ "xfree86-1.1": "XFree86 License 1.1",
437
+ "xinetd": "xinetd License",
438
+ "xnet": "X.Net License",
439
+ "xpp": "XPP License",
440
+ "xskat": "XSkat License",
441
+ "ypl-1.0": "Yahoo! Public License v1.0",
442
+ "ypl-1.1": "Yahoo! Public License v1.1",
443
+ "zed": "Zed License",
444
+ "zend-2.0": "Zend License v2.0",
445
+ "zimbra-1.3": "Zimbra Public License v1.3",
446
+ "zimbra-1.4": "Zimbra Public License v1.4",
447
+ "zlib": "zlib License",
448
+ "zlib-acknowledgement": "zlib/libpng License with Acknowledgement",
449
+ "zpl-1.1": "Zope Public License 1.1",
450
+ "zpl-2.0": "Zope Public License 2.0",
451
+ "zpl-2.1": "Zope Public License 2.1"
452
+ }
tag_set.json DELETED
@@ -1 +0,0 @@
1
- {"task_structure": {"Txt2Class": "text to classification task", "Txt2Class.Bi": "text to binary classification task", "Txt2Class.Multi.Sing": "text to multiple classes single label", "Txt2Class.Multi.Multi": "text to multiple classes multiple labels", "Strct2Txt": "structured information to text task", "Txt2Strct": "text to structured information task", "Txt2Txt": "text to text task", "Txt": "just text", "Oth": "other"}, "purpose": {"NLI": "natural language inference", "SentA": "sentiment analysis", "MT": "machine translation", "Summ.ext": "extractive summarization", "Summ.abs": "abstractive summarization", "QA.abs": "abstractive question answering", "QA.ext": "extractive question answering", "QA.open": "open domain question answering", "QA.closed": "closed domain question answering", "QA.open.abs": "open domain abstractive question answering", "QA.closed.abs": "closed domain abstractive question answering", "QA.open.ext": "open domain extractive question answering", "QA.closed.ext": "closed domain extractive question answering", "Dialog": "dialogue or multi-turn text", "LM": "language modeling", "NER": "named entity recognition", "Pars": "parsing", "TxtSimp": "text simplification", "Coref": "coreference resolution", "FactChk": "fact checking", "EntLink": "entity linking", "SSplitFus": "sentence splitting/fusion", "SlotFillClz": "slot filling / Cloze test", "InfoRet": "information retrieval", "IntentClass": "intent classification", "SemSim": "semantic similarity", "Oth": "other"}, "language_producers": {"crwdsrc_l": "data produced by crowdsource workers", "machgen_l": "machine-generated data", "found_l": "found data", "Oth": "other"}, "annotation": {"crwdsrc_a": "annotation produced by crowdsource workers", "machgen_a": "machine-generated annotation", "exp_a": "expert annotation", "no_a": "no annotation", "Oth": "other"}, "license": {"afl-3.0": "Academic Free License", "apache-2.0": "Apache license 2.0", "artistic-2.0": "Artistic license 2.0", "bsl-1.0": "Boost 
Software License 1.0", "bsd-2-clause": "BSD 2-clause \"Simplified\" license", "bsd-3-clause": "BSD 3-clause \"New\" or \"Revised\" license", "bsd-3-clause-clear": "BSD 3-clause Clear license", "cc": "Creative Commons license family", "cc0-1.0": "Creative Commons Zero v1.0 Universal", "cc-by-4.0": "Creative Commons Attribution 4.0", "cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0", "wtfpl": "Do What The F*ck You Want To Public License", "ecl-2.0": "Educational Community License v2.0", "epl-1.0": "Eclipse Public License 1.0", "epl-2.0": "Eclipse Public License 2.0", "eupl-1.1": "European Union Public License 1.1", "agpl-3.0": "GNU Affero General Public License v3.0", "gpl": "GNU General Public License family", "gpl-2.0": "GNU General Public License v2.0", "gpl-3.0": "GNU General Public License v3.0", "lgpl": "GNU Lesser General Public License family", "lgpl-2.1": "GNU Lesser General Public License v2.1", "lgpl-3.0": "GNU Lesser General Public License v3.0", "isc": "ISC", "lppl-1.3c": "LaTeX Project Public License v1.3c", "ms-pl": "Microsoft Public License", "mit": "MIT", "mpl-2.0": "Mozilla Public License 2.0", "osl-3.0": "Open Software License 3.0", "postgresql": "PostgreSQL License", "ofl-1.1": "SIL Open Font License 1.1", "ncsa": "University of Illinois/NCSA Open Source License", "unlicense": "The Unlicense", "zlib": "zLib License", "Oth": "other"}, "language": {"cardinality": {"1ling": "monolingual; only one language in the dataset", "trsl": "translation; parallel language use", "multiling": "multilingual; more than one language being used within or across datasets over different content", "Oth": "other"}, "BCP-47": {"en": "English, dialect unknown", "es": "Spanish, dialect unknown", "fr": "French, dialect unknown", "sv": "Swedish, dialect unknown", "fi": "Finnish, dialect unknown", "de": "German, dialect unknown", "ru": "Russian, dialect unknown", "uk": "Ukranian, dialect unknown", "it": "Italian, dialect unknown", "eo": "Esperanto, dialect 
unknown", "ar": "Arabic, dialect unknown", "tr": "Turkish, dialect unknown", "bg": "Bulgarian, dialect unknown", "pl": "Polish, dialect unknown", "nl": "Dutch, dialect unknown", "id": "Indonesian, dialect unknown", "zh": "Chinese, dialect unknown", "af": "Afrikaans, dialect unknown", "ca": "Catalan, dialect unknown", "cs": "Czech, dialect unknown", "pt": "Portuguese, dialect unknown", "no": "Norwegian, dialect unknown", "he": "Hebrew, dialect unknown", "da": "Danish, dialect unknown", "is": "Icelandic, dialect unknown", "hu": "Hungarian, dialect unknown", "ro": "Romanian, dialect unknown", "ms": "Malay, dialect unknown", "ja": "Japanese, dialect unknown", "hi": "Hindi, dialect unknown", "sl": "Slovene, dialect unknown", "lt": "Lithuanian, dialect unknown", "ht": "Haitian, dialect unknown", "vi": "Vietnamese, dialect unknown", "et": "Estonian, dialect unknown", "el": "Greek, dialect unknown", "hr": "Croatian, dialect unknown", "mt": "Maltese, dialect unknown", "ts": "Tsonga, dialect unknown", "mk": "Macedonian, dialect unknown", "ln": "Lingala, dialect unknown", "ig": "Igbo, dialect unknown", "ee": "Ewe, dialect unknown", "xh": "Xhosa, dialect unknown", "sn": "Shona, dialect unknown", "rw": "Kinyarwanda, dialect unknown", "ny": "Chichewa, dialect unknown", "lv": "Latvian, dialect unknown", "lg": "Ganda, dialect unknown", "ko": "Korean, dialect unknown", "gl": "Galician, dialect unknown", "sg": "Sango, dialect unknown", "yo": "Yoruba, dialect unknown", "ur": "Urdu, dialect unknown", "rn": "Kirundi, dialect unknown", "mr": "Marathi, dialect unknown", "bn": "Bengali, dialect unknown", "nso": "Pedi, dialect unknown", "ty": "Tahitian, dialect unknown", "to": "Tonga, dialect unknown", "gu": "Gujarati, dialect unknown", "eu": "Basque, dialect unknown", "niu": "Niuean, dialect unknown", "guw": "Gun, dialect unknown", "gaa": "Ga, dialect unknown", "crs": "Seselwa Creole French, dialect unknown", "bcl": "Central Bikol, dialect unknown", "tn": "Tswana, dialect unknown", "sm": 
"Samoan, dialect unknown", "si": "Sinhala, dialect unknown", "nn": "Norwegian Nynorsk, dialect unknown", "nb": "Norwegian Bokm\u00e5l, dialect unknown", "fj": "Fijian, dialect unknown", "be": "Belarusian, dialect unknown", "pon": "Pohnpeian, dialect unknown", "pis": "Pijin, dialect unknown", "pap": "Papiamento, dialect unknown", "pag": "Pangasinan, dialect unknown", "lua": "Luba-Lulua, dialect unknown", "iso": "Isoko, dialect unknown", "ilo": "Iloko, dialect unknown", "gil": "Gilbertese, dialect unknown", "efi": "Efik, dialect unknown", "bzs": "Brazilian Sign Language, dialect unknown", "yi": "Yiddish, dialect unknown", "wa": "Walloon, dialect unknown", "sq": "Albanian, dialect unknown", "or": "Oriya, dialect unknown", "mh": "Marshallese, dialect unknown", "lb": "Luxembourgish, dialect unknown", "ha": "Hausa, dialect unknown", "fy": "Western Frisian, dialect unknown", "fo": "Faroese, dialect unknown", "as": "Assamese, dialect unknown", "tvl": "Tuvalua, dialect unknown", "tll": "Tetela, dialect unknown", "swc": "Congo Swahili, dialect unknown", "lus": "Lushai, dialect unknown", "loz": "Lozi, dialect unknown", "ceb": "Cebuano, dialect unknown", "ti": "Tigrinya, dialect unknown", "st": "Southern Sotho, dialect unknown", "rm": "Romansh, dialect unknown", "oc": "Occitan, dialect unknown", "kg": "Kongo, dialect unknown", "ga": "Irish, dialect unknown", "co": "Corsican, dialect unknown", "an": "Aragonese, dialect unknown", "war": "Waray, dialect unknown", "lue": "Luvale, dialect unknown", "hil": "Hiligaynon, dialect unknown", "bem": "Bemba, dialect unknown", "ase": "American Sign Language, dialect unknown", "zu": "Zulu, dialect unknown", "tw": "Twi, dialect unknown", "tl": "Tagalog, dialect unknown", "sk": "Slovak, dialect unknown", "lu": "Luba-Katanga, dialect unknown", "hy": "Armenian, dialect unknown", "gv": "Manx, dialect unknown", "cy": "Welsh, dialect unknown", "bi": "Bislama, dialect unknown", "am": "Amharic, dialect unknown", "srn": "Sranan Tongo, dialect 
unknown", "toi": "Tonga (Zambia), dialect unknown", "kqn": "Kaonde, dialect unknown", "se": "Northern Sami, dialect unknown", "ps": "Pashto, dialect unknown", "os": "Ossetian, dialect unknown", "zne": "Zande (individual language), dialect unknown", "wls": "Wallisian, dialect unknown", "tpi": "Tok Pisin, dialect unknown", "tiv": "Tiv, dialect unknown", "run": "Rundi, dialect unknown", "so": "Somali, dialect unknown", "kw": "Cornish, dialect unknown", "ho": "Hiri Motu, dialect unknown", "gd": "Scottish Gaelic, dialect unknown", "br": "Breton, dialect unknown", "tum": "Tumbuka, dialect unknown", "yap": "Yapese, dialect unknown", "rnd": "Ruund, dialect unknown", "mfe": "Morisyen, dialect unknown", "kwy": "San Salvador Kongo, dialect unknown", "chk": "Chuukese, dialect unknown", "ber": "Berber languages, dialect unknown", "wo": "Wolof, dialect unknown", "ve": "Venda, dialect unknown", "th": "Thai, dialect unknown", "sc": "Sardinian, dialect unknown", "ml": "Malayalam, dialect unknown", "mg": "Malagasy, dialect unknown", "km": "Khmer, dialect unknown", "ka": "Georgian, dialect unknown", "mos": "Mossi, dialect unknown", "ta": "Tamil, dialect unknown", "mn": "Mongolian, dialect unknown", "kn": "Kannada, dialect unknown", "az": "Azerbaijani, dialect unknown", "roa": "Romance languages, dialect unknown", "yue": "Yue Chinese, dialect unknown", "tt": "Tatar, dialect unknown", "tk": "Turkmen, dialect unknown", "te": "Telugu, dialect unknown", "na": "Nauru, dialect unknown", "mi": "M\u0101ori, dialect unknown", "cv": "Chuvash, dialect unknown", "ba": "Bashkir, dialect unknown", "cel": "Celtic languages, dialect unknown", "umb": "Umbundu, dialect unknown", "sa": "Sanskrit, dialect unknown", "my": "Burmese, dialect unknown", "lo": "Lao, dialect unknown", "kl": "Kalaallisut, dialect unknown", "io": "Ido, dialect unknown", "ce": "Chechen, dialect unknown", "ab": "Abkhaz, dialect unknown", "fse": "Finnish Sign Language, dialect unknown", "zai": "Isthmus Zapotec, dialect unknown", 
"tzo": "Tzotzil, dialect unknown", "prl": "Peruvian Sign Language, dialect unknown", "mfs": "Mexican Sign Language, dialect unknown", "nyk": "Nyaneka, dialect unknown", "luo": "Luo, dialect unknown", "lun": "Lunda, dialect unknown", "kwn": "Kwangali, dialect unknown", "csn": "Colombian Sign Language, dialect unknown", "csg": "Chilean Sign Language, dialect unknown", "aed": "Argentine Sign Language, dialect unknown", "sw": "Swahili, dialect unknown", "su": "Sundanese, dialect unknown", "ss": "Swati, dialect unknown", "om": "Oromo, dialect unknown", "nv": "Navajo, dialect unknown", "ng": "Ndonga, dialect unknown", "ne": "Nepali, dialect unknown", "kj": "Kwanyama, dialect unknown", "jv": "Javanese, dialect unknown", "gn": "Guaran\u00ed, dialect unknown", "fa": "Persian, dialect unknown", "ch": "Chamorro, dialect unknown", "bo": "Tibetan Standard, dialect unknown", "wal": "Wolaitta, dialect unknown", "vsl": "Venezuelan Sign Language, dialect unknown", "ssp": "Spanish Sign Language, dialect unknown", "kab": "Kabyle, dialect unknown", "yua": "Yucateco, dialect unknown", "tdt": "Tetun Dili, dialect unknown", "pa": "Punjabi, dialect unknown", "nr": "Southern Ndebele, dialect unknown", "kk": "Kazakh, dialect unknown", "dv": "Divehi, dialect unknown", "Oth": "other"}}}
 
 
tagging_app.py CHANGED
@@ -16,97 +16,17 @@ st.beta_set_page_config(
16
  initial_sidebar_state="auto",
17
  )
18
 
19
- task_set = {
20
- "conditional-text-generation": {
21
- "description": "data-to-text and text transduction tasks such as translation or summarization",
22
- "options": [
23
- "machine-translation",
24
- "sentence-splitting-fusion",
25
- "summarization",
26
- "table-to-text",
27
- "text-simplification",
28
- "explanation-generation",
29
- "other",
30
- ],
31
- },
32
- "question-answering": {
33
- "description": "question answering tasks",
34
- "options": [
35
- "open-domain-qa",
36
- "closed-domain-qa",
37
- "multiple-choice-qa",
38
- "extractive-qa",
39
- "abstractive-qa",
40
- "other",
41
- ],
42
- },
43
- "sequence-modeling": {
44
- "description": "such as language modeling or dialogue",
45
- "options": [
46
- "dialogue-modeling",
47
- "language-modeling",
48
- "other-multi-turn",
49
- "slot-filling",
50
- "other",
51
- ],
52
- },
53
- "structure-prediction": {
54
- "description": "predicting structural properties of the text, such as syntax",
55
- "options": [
56
- "coreference-resolution",
57
- "named-entity-recognition",
58
- "parsing",
59
- "other",
60
- ],
61
- },
62
- "text-classification": {
63
- "description": "predicting a class index or boolean value",
64
- "options": [
65
- "acceptability-classification",
66
- "entity-linking-classification",
67
- "fact-checking",
68
- "intent-classification",
69
- "multi-class-classification",
70
- "multi-label-classification",
71
- "natural-language-inference",
72
- "semantic-similarity-classification",
73
- "sentiment-classification",
74
- "topic-classification",
75
- "other",
76
- ],
77
- },
78
- "text-retrieval": {
79
- "description": "information or text retrieval tasks",
80
- "options": [
81
- "document-retrieval",
82
- "utterance-retrieval",
83
- "entity-linking-retrieval",
84
- "fact-checking-retrieval",
85
- "other",
86
- ],
87
- },
88
- "text-scoring": {
89
- "description": "text scoring tasks, predicting a real valued score for some text",
90
- "options": [
91
- "semantic-similarity-scoring",
92
- "sentiment-scoring",
93
- "other",
94
- ],
95
- },
96
- "other": {
97
- "description": "other task family not mentioned here",
98
- "options": [
99
- "other",
100
- ],
101
- },
102
- }
103
 
104
  multilinguality_set = {
105
  "monolingual": "contains a single language",
106
  "multilingual": "contains multiple languages",
107
  "translation": "contains translated or aligned text",
108
  "other": "other type of language distribution",
109
- }
110
 
111
  creator_set = {
112
  "language": [
@@ -126,51 +46,7 @@ creator_set = {
126
  ],
127
  }
128
 
129
- license_set = {
130
- 'afl-3.0': 'Academic Free License',
131
- 'apache-2.0': 'Apache license 2.0',
132
- 'artistic-2.0': 'Artistic license 2.0',
133
- 'bsl-1.0': 'Boost Software License 1.0',
134
- 'bsd-2-clause': 'BSD 2-clause "Simplified" license',
135
- 'bsd-3-clause': 'BSD 3-clause "New" or "Revised" license',
136
- 'bsd-3-clause-clear': 'BSD 3-clause Clear license',
137
- 'cc': 'Creative Commons license family',
138
- 'cc0-1.0': 'Creative Commons Zero v1.0 Universal',
139
- 'cc-by-sa-3.0': 'Creative Commons Attribution Share Alike 3.0',
140
- 'cc-by-4.0': 'Creative Commons Attribution 4.0',
141
- 'cc-by-nc-4.0': 'Creative Commons Attribution Non Commercial 4.0',
142
- 'cc-by-nc-sa-4.0': 'Creative Commons Attribution Non Commercial Share Alike 4.0',
143
- 'cc-by-sa-4.0': 'Creative Commons Attribution Share Alike 4.0',
144
- 'wtfpl': 'Do What The F*ck You Want To Public License',
145
- 'ecl-2.0': 'Educational Community License v2.0',
146
- 'epl-1.0': 'Eclipse Public License 1.0',
147
- 'epl-2.0': 'Eclipse Public License 2.0',
148
- 'eupl-1.1': 'European Union Public License 1.1',
149
- 'agpl-3.0': 'GNU Affero General Public License v3.0',
150
- 'gpl': 'GNU General Public License family',
151
- 'gpl-2.0': 'GNU General Public License v2.0',
152
- 'gpl-3.0': 'GNU General Public License v3.0',
153
- 'lgpl': 'GNU Lesser General Public License family',
154
- 'lgpl-2.1': 'GNU Lesser General Public License v2.1',
155
- 'lgpl-3.0': 'GNU Lesser General Public License v3.0',
156
- 'isc': 'ISC',
157
- 'lppl-1.3c': 'LaTeX Project Public License v1.3c',
158
- 'ms-pl': 'Microsoft Public License',
159
- 'mit': 'MIT',
160
- 'mpl-2.0': 'Mozilla Public License 2.0',
161
- 'osl-3.0': 'Open Software License 3.0',
162
- 'postgresql': 'PostgreSQL License',
163
- 'ofl-1.1': 'SIL Open Font License 1.1',
164
- 'ncsa': 'University of Illinois/NCSA Open Source License',
165
- 'unlicense': 'The Unlicense',
166
- 'zlib': 'zLib License',
167
- 'other': 'other license',
168
- 'unknown': 'could not find license information',
169
- }
170
 
171
- tag_set = json.load(open('tag_set.json'))
172
- language_set = dict([(k, v.replace(', dialect unknown', ''))
173
- for k, v in tag_set['language']["BCP-47"].items()])
174
 
175
  ########################
176
  ## Helper functions
@@ -205,7 +81,7 @@ def filter_features(feature_dict):
205
  return {
206
  "feature_type": feature_dict["_type"],
207
  "dtype": "string",
208
- "languages": feature_dict["languages"],
209
  }
210
  else:
211
  return dict([(k, filter_features(v)) for k, v in feature_dict.items()])
@@ -271,12 +147,12 @@ st.sidebar.markdown(
271
  )
272
 
273
  app_desc = """
274
- ### Dataset Tagger
275
 
276
- This app aims to make it easier to add structured tags to the datasets present in the library.
277
 
278
  Each configuration requires its own tasks, as these often correspond to distinct sub-tasks. However, we provide the opportunity
279
- to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
280
 
281
  The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
282
  """
@@ -390,13 +266,13 @@ with c2.beta_expander("- Choose tag set to pre-load"):
390
 
391
  pre_loaded["languages"] = list(set(pre_loaded["languages"] + find_languages(features)))
392
  if config_infos["license"] in license_set:
393
- pre_loaded["licenses"] = list(set(pre_loaded["licenses"] + [config_infos["license"]]))
394
 
395
  ##########
396
  # Modify or add new tags
397
  ##########
398
  c2.markdown("#### Editing the tag set")
399
- c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
400
 
401
  with c2.beta_expander("- Supported tasks"):
402
  task_categories = st.multiselect(
@@ -414,13 +290,13 @@ with c2.beta_expander("- Supported tasks"):
414
  )
415
  if "other" in task_specs:
416
  other_task = st.text_input(
417
- "You selected 'other' task. Please enter a short hyphen-separated description for the task:",
418
  value='my-task-description',
419
  )
420
  st.write(f"Registering {tg}-other-{other_task} task")
421
  task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
422
  task_specifics += task_specs
423
-
424
  with c2.beta_expander("- Languages"):
425
  multilinguality = st.multiselect(
426
  "Does the dataset contain more than one language?",
@@ -430,7 +306,7 @@ with c2.beta_expander("- Languages"):
430
  )
431
  if "other" in multilinguality:
432
  other_multilinguality = st.text_input(
433
- "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
434
  value='my-multilinguality',
435
  )
436
  st.write(f"Registering other-{other_multilinguality} multilinguality")
@@ -461,7 +337,7 @@ with c2.beta_expander("- Dataset creators"):
461
  )
462
  if "other" in licenses:
463
  other_license = st.text_input(
464
- "You selected 'other' type of license. Please enter a short hyphen-separated description:",
465
  value='my-license',
466
  )
467
  st.write(f"Registering other-{other_license} license")
@@ -487,13 +363,13 @@ with c2.beta_expander("- Dataset creators"):
487
  )
488
  if "other" in extended_sources:
489
  other_extended_sources = st.text_input(
490
- "You selected 'other' dataset. Please enter a short hyphen-separated description:",
491
  value='my-dataset',
492
  )
493
  st.write(f"Registering other-{other_extended_sources} dataset")
494
  extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
495
  source_datasets += [f"extended|{src}" for src in extended_sources]
496
-
497
  num_examples = (
498
  sum([dct.get('num_examples', 0) for spl, dct in config_infos['splits'].items()])
499
  if config_infos.get('splits', None) is not None
@@ -511,7 +387,7 @@ elif num_examples < 1000000:
511
  size_cat = "100K<n<1M"
512
  else:
513
  size_cat = "n>1M"
514
-
515
  res = {
516
  "task_categories": task_categories,
517
  "task_ids": task_specifics,
@@ -535,7 +411,7 @@ if c3.button("Done? Save to File!"):
535
  if not os.path.isdir(pjoin('saved_tags', dataset_id, config_id)):
536
  _ = os.mkdir(pjoin('saved_tags', dataset_id, config_id))
537
  json.dump(res, open(pjoin('saved_tags', dataset_id, config_id, 'tags.json'), 'w'))
538
-
539
  with c3.beta_expander("Show JSON output"):
540
  st.write(res)
541
 
@@ -546,4 +422,3 @@ c3.markdown("--- ")
546
 
547
  with c3.beta_expander("----> show full task set <----", expanded=True):
548
  st.write(task_set)
549
-
 
16
  initial_sidebar_state="auto",
17
  )
18
 
19
+ task_set = json.load(open("task_set.json"))
20
+ license_set = json.load(open("license_set.json"))
21
+ language_set = json.load(open("language_set.json"))
22
+ language_set_full = json.load(open("language_set_full.json"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  multilinguality_set = {
25
  "monolingual": "contains a single language",
26
  "multilingual": "contains multiple languages",
27
  "translation": "contains translated or aligned text",
28
  "other": "other type of language distribution",
29
+ }
30
 
31
  creator_set = {
32
  "language": [
 
46
  ],
47
  }
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
50
 
51
  ########################
52
  ## Helper functions
 
81
  return {
82
  "feature_type": feature_dict["_type"],
83
  "dtype": "string",
84
+ "languages": feature_dict["languages"],
85
  }
86
  else:
87
  return dict([(k, filter_features(v)) for k, v in feature_dict.items()])
 
147
  )
148
 
149
  app_desc = """
150
+ ### Dataset Tagger
151
 
152
+ This app aims to make it easier to add structured tags to the datasets present in the library.
153
 
154
  Each configuration requires its own tasks, as these often correspond to distinct sub-tasks. However, we provide the opportunity
155
+ to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
156
 
157
  The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
158
  """
 
266
 
267
  pre_loaded["languages"] = list(set(pre_loaded["languages"] + find_languages(features)))
268
  if config_infos["license"] in license_set:
269
+ pre_loaded["licenses"] = list(set(pre_loaded["licenses"] + [config_infos["license"]]))
270
 
271
  ##########
272
  # Modify or add new tags
273
  ##########
274
  c2.markdown("#### Editing the tag set")
275
+ c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
276
 
277
  with c2.beta_expander("- Supported tasks"):
278
  task_categories = st.multiselect(
 
290
  )
291
  if "other" in task_specs:
292
  other_task = st.text_input(
293
+ "You selected 'other' task. Please enter a short hyphen-separated description for the task:",
294
  value='my-task-description',
295
  )
296
  st.write(f"Registering {tg}-other-{other_task} task")
297
  task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
298
  task_specifics += task_specs
299
+
300
  with c2.beta_expander("- Languages"):
301
  multilinguality = st.multiselect(
302
  "Does the dataset contain more than one language?",
 
306
  )
307
  if "other" in multilinguality:
308
  other_multilinguality = st.text_input(
309
+ "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
310
  value='my-multilinguality',
311
  )
312
  st.write(f"Registering other-{other_multilinguality} multilinguality")
 
337
  )
338
  if "other" in licenses:
339
  other_license = st.text_input(
340
+ "You selected 'other' type of license. Please enter a short hyphen-separated description:",
341
  value='my-license',
342
  )
343
  st.write(f"Registering other-{other_license} license")
 
363
  )
364
  if "other" in extended_sources:
365
  other_extended_sources = st.text_input(
366
+ "You selected 'other' dataset. Please enter a short hyphen-separated description:",
367
  value='my-dataset',
368
  )
369
  st.write(f"Registering other-{other_extended_sources} dataset")
370
  extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
371
  source_datasets += [f"extended|{src}" for src in extended_sources]
372
+
373
  num_examples = (
374
  sum([dct.get('num_examples', 0) for spl, dct in config_infos['splits'].items()])
375
  if config_infos.get('splits', None) is not None
 
387
  size_cat = "100K<n<1M"
388
  else:
389
  size_cat = "n>1M"
390
+
391
  res = {
392
  "task_categories": task_categories,
393
  "task_ids": task_specifics,
 
411
  if not os.path.isdir(pjoin('saved_tags', dataset_id, config_id)):
412
  _ = os.mkdir(pjoin('saved_tags', dataset_id, config_id))
413
  json.dump(res, open(pjoin('saved_tags', dataset_id, config_id, 'tags.json'), 'w'))
414
+
415
  with c3.beta_expander("Show JSON output"):
416
  st.write(res)
417
 
 
422
 
423
  with c3.beta_expander("----> show full task set <----", expanded=True):
424
  st.write(task_set)
 
task_set.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "conditional-text-generation": {
3
+ "description": "data-to-text and text transduction tasks such as translation or summarization",
4
+ "options": [
5
+ "machine-translation",
6
+ "sentence-splitting-fusion",
7
+ "summarization",
8
+ "table-to-text",
9
+ "text-simplification",
10
+ "explanation-generation",
11
+ "other"
12
+ ]
13
+ },
14
+ "question-answering": {
15
+ "description": "question answering tasks",
16
+ "options": [
17
+ "open-domain-qa",
18
+ "closed-domain-qa",
19
+ "multiple-choice-qa",
20
+ "extractive-qa",
21
+ "abstractive-qa",
22
+ "other"
23
+ ]
24
+ },
25
+ "sequence-modeling": {
26
+ "description": "sequence modeling tasks, such as language modeling or dialogue",
27
+ "options": [
28
+ "dialogue-modeling",
29
+ "language-modeling",
30
+ "other-multi-turn",
31
+ "slot-filling",
32
+ "other"
33
+ ]
34
+ },
35
+ "structure-prediction": {
36
+ "description": "predicting structural properties of the text, such as syntax",
37
+ "options": [
38
+ "coreference-resolution",
39
+ "named-entity-recognition",
40
+ "parsing",
41
+ "other"
42
+ ]
43
+ },
44
+ "text-classification": {
45
+ "description": "predicting a class index or boolean value",
46
+ "options": [
47
+ "acceptability-classification",
48
+ "entity-linking-classification",
49
+ "fact-checking",
50
+ "intent-classification",
51
+ "multi-class-classification",
52
+ "multi-label-classification",
53
+ "natural-language-inference",
54
+ "semantic-similarity-classification",
55
+ "sentiment-classification",
56
+ "topic-classification",
57
+ "other"
58
+ ]
59
+ },
60
+ "text-retrieval": {
61
+ "description": "information or text retrieval tasks",
62
+ "options": [
63
+ "document-retrieval",
64
+ "utterance-retrieval",
65
+ "entity-linking-retrieval",
66
+ "fact-checking-retrieval",
67
+ "other"
68
+ ]
69
+ },
70
+ "text-scoring": {
71
+ "description": "text scoring tasks, predicting a real-valued score for some text",
72
+ "options": [
73
+ "semantic-similarity-scoring",
74
+ "sentiment-scoring",
75
+ "other"
76
+ ]
77
+ },
78
+ "other": {
79
+ "description": "other task family not mentioned here",
80
+ "options": [
81
+ "other"
82
+ ]
83
+ }
84
+ }