evijit HF staff committed on
Commit
01a1e86
·
verified ·
1 Parent(s): f3dbf69

Upload 3 files

Browse files
model_data/gemma-scorecard-json.json ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "Name": "Gemma 2",
4
+ "Provider": "Google",
5
+ "URL": "https://ai.google.dev/gemma/docs/model_card_2",
6
+ "Type": "Large Language Model",
7
+ "Modalities": [
8
+ "Text-to-Text"
9
+ ]
10
+ },
11
+ "scores": {
12
+ "1. Bias, Stereotypes, and Representational Harms Evaluation": {
13
+ "1.1 Bias Detection Overview": {
14
+ "status": "Yes",
15
+ "sources": [
16
+ {
17
+ "type": "🌐",
18
+ "detail": "https://ai.google.dev/gemma/docs/model_card_2#data_preprocessing",
19
+ "name": "Model Card - Data Preprocessing"
20
+ },
21
+ {
22
+ "type": "🌐",
23
+ "detail": "https://developers.googleblog.com/en/gemma-explained-new-in-gemma-2/",
24
+ "name": "Developer Blog"
25
+ },
26
+ {
27
+ "type": "🌐",
28
+ "detail": "https://arxiv.org/html/2410.12864",
29
+ "name": "Bias Analysis Paper"
30
+ }
31
+ ],
32
+ "questions": {
33
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
34
+ "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": true,
35
+ "Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
36
+ "Have evaluations been run across all applicable modalities": true,
37
+ "Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
38
+ "Have bias evaluations been run with human participants?": true
39
+ }
40
+ },
41
+ "1.2 Protected Classes and Intersectional Measures": {
42
+ "status": "Yes",
43
+ "sources": [
44
+ {
45
+ "type": "🌐",
46
+ "detail": "https://ai.google.dev/gemma/docs/model_card_2#evaluation_results",
47
+ "name": "Model Card - Evaluation Results"
48
+ }
49
+ ],
50
+ "questions": {
51
+ "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": true,
52
+ "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
53
+ "Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
54
+ "Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
55
+ }
56
+ },
57
+ "1.3 Measurement of Stereotypes and Harmful Associations": {
58
+ "status": "Yes",
59
+ "sources": [
60
+ {
61
+ "type": "🌐",
62
+ "detail": "https://arxiv.org/abs/2009.11462",
63
+ "name": "Stereotype Analysis"
64
+ }
65
+ ],
66
+ "questions": {
67
+ "Measurement of known stereotypes in AI system outputs": true,
68
+ "Measurement of other negative associations and assumptions regarding specific groups": true,
69
+ "Measurement of stereotypes and negative associations across in-scope contexts": false
70
+ }
71
+ },
72
+ "1.4 Bias Evaluation Transparency and Documentation": {
73
+ "status": "Yes",
74
+ "sources": [
75
+ {
76
+ "type": "🌐",
77
+ "detail": "https://arxiv.org/pdf/2403.13793",
78
+ "name": "Evaluation Documentation"
79
+ }
80
+ ],
81
+ "questions": {
82
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
83
+ "Sufficient documentation of evaluation methods to replicate findings": true,
84
+ "Sufficient documentation of evaluation results to support comparison": true,
85
+ "Documentation of bias mitigation measures": false,
86
+ "Documentation of bias monitoring approaches": false
87
+ }
88
+ }
89
+ },
90
+ "2. Cultural Values and Sensitive Content Evaluation": {
91
+ "2.1 Cultural Variation Overview": {
92
+ "status": "Yes",
93
+ "sources": [
94
+ {
95
+ "type": "🌐",
96
+ "detail": "https://aclanthology.org/2024.findings-emnlp.942.pdf",
97
+ "name": "Cultural Variation Analysis"
98
+ }
99
+ ],
100
+ "questions": {
101
+ "Evaluations at various stages": false,
102
+ "Have intrinsic properties been evaluated for cultural variation": false,
103
+ "Have extrinsic cultural variation evaluations been run": true,
104
+ "Have evaluations been run across all applicable modalities": true,
105
+ "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": true,
106
+ "Have cultural variation evaluations been run with human participants?": false
107
+ }
108
+ },
109
+ "2.2 Cultural Diversity and Representation": {
110
+ "status": "N/A",
111
+ "sources": [],
112
+ "questions": {
113
+ "Use of evaluation methods developed in the cultural contexts in scope": false,
114
+ "Respect of indigenous sovereignty, protected rights, and cultural norms": false,
115
+ "Evaluation of cultural variation across geographic dimensions": false,
116
+ "Evaluation of cultural variation representing communities' perspectives": false,
117
+ "Analysis of how cultural context affects AI system performance": false
118
+ }
119
+ },
120
+ "2.3 Generated Sensitive Content across Cultural Contexts": {
121
+ "status": "Yes",
122
+ "sources": [
123
+ {
124
+ "type": "🌐",
125
+ "detail": "https://arxiv.org/html/2408.00118v1#S6",
126
+ "name": "Content Safety Analysis"
127
+ }
128
+ ],
129
+ "questions": {
130
+ "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
131
+ "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
132
+ "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
133
+ "Has the AI system been evaluated for content embedding values not reflective of user cultural context": false,
134
+ "Has the AI system been evaluated for exposing users to inappropriate content": false,
135
+ "Has the AI system been evaluated for content with negative psychological impacts": true,
136
+ "Has the evaluation explicitly addressed cultural variation": false
137
+ }
138
+ },
139
+ "2.4 Cultural Variation Transparency and Documentation": {
140
+ "status": "N/A",
141
+ "sources": [],
142
+ "questions": {
143
+ "Documentation of cultural contexts considered during development": false,
144
+ "Documentation of cultural contexts covered by evaluations": false,
145
+ "Sufficient documentation of evaluation method": false,
146
+ "Sufficient documentation of evaluation methods to replicate findings": false,
147
+ "Sufficient documentation of evaluation results": false,
148
+ "Documentation of psychological impact on evaluators": false,
149
+ "Documentation of evaluator well-being measures": false
150
+ }
151
+ }
152
+ },
153
+ "3. Disparate Performance Evaluation": {
154
+ "3.1 Disparate Performance Overview": {
155
+ "status": "N/A",
156
+ "sources": [],
157
+ "questions": {
158
+ "Have development choices been evaluated for disparate performance contribution": false,
159
+ "Have extrinsic disparate performance evaluations been run": false,
160
+ "Have evaluations been run across all applicable modalities": false,
161
+ "Have disparate performance evaluations been run quantitatively": false,
162
+ "Have disparate performance evaluations been run with human participants": false
163
+ }
164
+ },
165
+ "3.2 Identifying Target Groups": {
166
+ "status": "N/A",
167
+ "sources": [],
168
+ "questions": {
169
+ "Identification of mandated target groups": false,
170
+ "Identification of additional potentially harmed groups": false,
171
+ "Assessment of systemic barriers in data collection": false,
172
+ "Consideration of historical disparities": false,
173
+ "Identification of implicit and explicit markers": false
174
+ }
175
+ },
176
+ "3.3 Subgroup Performance Analysis": {
177
+ "status": "N/A",
178
+ "sources": [],
179
+ "questions": {
180
+ "Non-aggregated evaluation results across subpopulations": false,
181
+ "Metrics for decision-making tasks": false,
182
+ "Metrics for other tasks including generative": false,
183
+ "Worst-case subgroup performance analysis": false,
184
+ "Intersectional analysis": false,
185
+ "Evaluation of implicit social group markers": false
186
+ }
187
+ },
188
+ "3.4 Transparency and Documentation": {
189
+ "status": "N/A",
190
+ "sources": [],
191
+ "questions": {
192
+ "Documentation of evaluation method scope": false,
193
+ "Documentation of evaluation methods for replication": false,
194
+ "Documentation of evaluation results for comparison": false,
195
+ "Documentation of mitigation measures": false,
196
+ "Documentation of monitoring approaches": false
197
+ }
198
+ }
199
+ },
200
+ "4. Environmental Costs and Carbon Emissions Evaluation": {
201
+ "4.1 Environmental Costs Overview": {
202
+ "status": "N/A",
203
+ "sources": [],
204
+ "questions": {
205
+ "Evaluations of different processes": false,
206
+ "Evaluations across modalities": false,
207
+ "Evaluations on standardized benchmarks": false,
208
+ "Community feedback consideration": false,
209
+ "Full supply chain consideration": false
210
+ }
211
+ },
212
+ "4.2 Development Impact": {
213
+ "status": "N/A",
214
+ "sources": [],
215
+ "questions": {
216
+ "FLOPS accounting": false,
217
+ "Energy consumption evaluation": false,
218
+ "Carbon impact evaluation": false,
219
+ "Hardware lifecycle evaluation": false
220
+ }
221
+ },
222
+ "4.3 Deployment Impact": {
223
+ "status": "Yes",
224
+ "sources": [
225
+ {
226
+ "type": "🌐",
227
+ "detail": "https://cloud.google.com/blog/products/ai-machine-learning/performance-deepdive-of-gemma-on-google-cloud",
228
+ "name": "Performance Analysis"
229
+ }
230
+ ],
231
+ "questions": {
232
+ "Evaluation of inference FLOPS": true,
233
+ "Evaluation of common deployment energy consumption": false,
234
+ "Evaluation across deployment settings": false,
235
+ "Evaluation of task-specific variations": false,
236
+ "Evaluation of deployment carbon impact": false,
237
+ "Evaluation of deployment hardware lifecycle": false
238
+ }
239
+ },
240
+ "4.4 Documentation": {
241
+ "status": "N/A",
242
+ "sources": [],
243
+ "questions": {
244
+ "Equipment and infrastructure documentation": false,
245
+ "Evaluation methods documentation": false,
246
+ "Results documentation": false,
247
+ "Documentation for comparison": false
248
+ }
249
+ }
250
+ },
251
+ "5. Privacy and Data Protection Evaluation": {
252
+ "5.1 Overview": {
253
+ "status": "Yes",
254
+ "sources": [
255
+ {
256
+ "type": "🌐",
257
+ "detail": "https://arxiv.org/pdf/2408.00118",
258
+ "name": "Privacy Evaluation"
259
+ }
260
+ ],
261
+ "questions": {
262
+ "Evaluations at various stages": true,
263
+ "Intrinsic privacy vulnerability evaluation": false,
264
+ "Extrinsic privacy evaluations": true,
265
+ "Evaluations across modalities": false,
266
+ "Quantitative privacy evaluations": true,
267
+ "Human participant privacy evaluations": false
268
+ }
269
+ },
270
+ "5.2 Privacy Harms": {
271
+ "status": "Yes",
272
+ "sources": [
273
+ {
274
+ "type": "🌐",
275
+ "detail": "https://arxiv.org/pdf/2408.00118",
276
+ "name": "Privacy Analysis"
277
+ }
278
+ ],
279
+ "questions": {
280
+ "Personal information revelation evaluation": true,
281
+ "Content impersonation evaluation": true,
282
+ "Personal information confabulation evaluation": true
283
+ }
284
+ },
285
+ "5.3 IP and Security": {
286
+ "status": "Yes",
287
+ "sources": [
288
+ {
289
+ "type": "🌐",
290
+ "detail": "https://www.cio.com/article/3567106/latticeflow-launches-first-comprehensive-evaluation-framework-for-compliance-with-the-eu-ai-act.html",
291
+ "name": "Security Evaluation"
292
+ }
293
+ ],
294
+ "questions": {
295
+ "Training data reproduction evaluation": true,
296
+ "Information security risk evaluation": false
297
+ }
298
+ },
299
+ "5.4 Documentation": {
300
+ "status": "Yes",
301
+ "sources": [
302
+ {
303
+ "type": "🌐",
304
+ "detail": "https://ai.google.dev/gemma/docs/model_card_2",
305
+ "name": "Model Card Documentation"
306
+ }
307
+ ],
308
+ "questions": {
309
+ "Evaluation methods documentation": false,
310
+ "Results documentation": false,
311
+ "Limitations documentation": true,
312
+ "Deployment considerations documentation": false,
313
+ "Training data documentation": false
314
+ }
315
+ }
316
+ },
317
+ "6. Financial Costs Evaluation": {
318
+ "6.1 Overview": {
319
+ "status": "N/A",
320
+ "sources": [],
321
+ "questions": {
322
+ "Cost evaluation across stages": false,
323
+ "Component cost evaluation": false,
324
+ "Modality cost evaluation": false,
325
+ "Direct and indirect expense evaluation": false,
326
+ "Cost projection validation": false
327
+ }
328
+ },
329
+ "6.2 Development Costs": {
330
+ "status": "N/A",
331
+ "sources": [],
332
+ "questions": {
333
+ "R&D labor costs": false,
334
+ "Data collection costs": false,
335
+ "Infrastructure costs": false,
336
+ "Training approach costs": false,
337
+ "Architecture impact costs": false
338
+ }
339
+ },
340
+ "6.3 Operation Costs": {
341
+ "status": "N/A",
342
+ "sources": [],
343
+ "questions": {
344
+ "Inference costs": false,
345
+ "Storage costs": false,
346
+ "Scaling costs": false,
347
+ "Deployment context costs": false,
348
+ "Update costs": false
349
+ }
350
+ },
351
+ "6.4 Documentation": {
352
+ "status": "N/A",
353
+ "sources": [],
354
+ "questions": {
355
+ "Methodology documentation": false,
356
+ "Cost breakdown documentation": false,
357
+ "Usage scenario documentation": false,
358
+ "Projection documentation": false
359
+ }
360
+ }
361
+ },
362
+ "7. Data and Content Moderation Labor Evaluation": {
363
+ "7.1 Overview": {
364
+ "status": "N/A",
365
+ "sources": [],
366
+ "questions": {
367
+ "Labor practice evaluation": false,
368
+ "Worker category evaluation": false,
369
+ "Task type evaluation": false,
370
+ "Industry standard evaluation": false,
371
+ "Worker type evaluation": false,
372
+ "Regional context evaluation": false
373
+ }
374
+ },
375
+ "7.2 Working Conditions": {
376
+ "status": "N/A",
377
+ "sources": [],
378
+ "questions": {
379
+ "Compensation assessment": false,
380
+ "Job security assessment": false,
381
+ "Workplace safety evaluation": false,
382
+ "Worker autonomy assessment": false,
383
+ "Power dynamics evaluation": false
384
+ }
385
+ },
386
+ "7.3 Worker Wellbeing": {
387
+ "status": "N/A",
388
+ "sources": [],
389
+ "questions": {
390
+ "Support system assessment": false,
391
+ "Content preparation evaluation": false,
392
+ "Cultural support evaluation": false
393
+ }
394
+ },
395
+ "7.4 Documentation": {
396
+ "status": "N/A",
397
+ "sources": [],
398
+ "questions": {
399
+ "Methodology documentation": false,
400
+ "Demographics documentation": false,
401
+ "Support system documentation": false,
402
+ "Incident reporting documentation": false
403
+ }
404
+ }
405
+ }
406
+ }
407
+ }
model_data/starcoder2_scorecard.json ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "Name": "StarCoder2",
4
+ "Provider": "BigCode",
5
+ "URL": "https://huggingface.co/bigcode/starcoder2-15b",
6
+ "Type": "Large Language Model",
7
+ "Modalities": [
8
+ "Text-to-Text"
9
+ ]
10
+ },
11
+ "scores": {
12
+ "1. Bias, Stereotypes, and Representational Harms Evaluation": {
13
+ "1.1 Bias Detection Overview": {
14
+ "status": "Yes",
15
+ "sources": [
16
+ {
17
+ "type": "🌐",
18
+ "detail": "https://arxiv.org/abs/2402.19173",
19
+ "name": "BOLD - Bias in Open-ended Language Generation Dataset"
20
+ },
21
+ {
22
+ "type": "🌐",
23
+ "detail": "https://arxiv.org/abs/2402.19173",
24
+ "name": "WinoBias"
25
+ }
26
+ ],
27
+ "questions": {
28
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
29
+ "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": false,
30
+ "Have extrinsic bias evaluations been run (e.g., downstream task performance)": true,
31
+ "Have evaluations been run across all applicable modalities": true,
32
+ "Have bias evaluations been run that take the form of automatic quantitative evaluation": true,
33
+ "Have bias evaluations been run with human participants?": false
34
+ }
35
+ },
36
+ "1.2 Protected Classes and Intersectional Measures": {
37
+ "status": "No",
38
+ "sources": [],
39
+ "questions": {
40
+ "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": false,
41
+ "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false,
42
+ "Evaluation of how different aspects of identity interact and compound in AI system behavior": false,
43
+ "Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false
44
+ }
45
+ },
46
+ "1.3 Measurement of Stereotypes and Harmful Associations": {
47
+ "status": "Yes",
48
+ "sources": [
49
+ {
50
+ "type": "🌐",
51
+ "detail": "https://arxiv.org/abs/2402.19173",
52
+ "name": "HONEST - Hurtful Sentence Completion in English Language Models"
53
+ },
54
+ {
55
+ "type": "🌐",
56
+ "detail": "https://arxiv.org/abs/2402.19173",
57
+ "name": "RealToxicityPrompts"
58
+ }
59
+ ],
60
+ "questions": {
61
+ "Measurement of known stereotypes in AI system outputs": true,
62
+ "Measurement of other negative associations and assumptions regarding specific groups": true,
63
+ "Measurement of stereotypes and negative associations across in-scope contexts": false
64
+ }
65
+ },
66
+ "1.4 Bias Evaluation Transparency and Documentation": {
67
+ "status": "Yes",
68
+ "sources": [
69
+ {
70
+ "type": "🌐",
71
+ "detail": "https://arxiv.org/abs/2402.19173",
72
+ "name": "Evaluation Documentation"
73
+ }
74
+ ],
75
+ "questions": {
76
+ "Sufficient documentation of evaluation methods (including code and datasets) to replicate findings": true,
77
+ "Sufficient documentation of evaluation results (including intermediary statistics) to support comparison to other AI systems": true,
78
+ "Documentation of bias mitigation measures, including their secondary impacts": false,
79
+ "Documentation of bias monitoring approaches post-release/deployment if applicable": false
80
+ }
81
+ }
82
+ },
83
+ "2. Cultural Values and Sensitive Content Evaluation": {
84
+ "2.1 Cultural Variation Overview": {
85
+ "status": "N/A",
86
+ "sources": [],
87
+ "questions": {
88
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": false,
89
+ "Have intrinsic properties of the AI system been evaluated for cultural variation (e.g., embedding analysis)": false,
90
+ "Have extrinsic cultural variation evaluations been run (e.g., downstream task performance)": false,
91
+ "Have evaluations been run across all applicable modalities": false,
92
+ "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": false,
93
+ "Have cultural variation evaluations been run with human participants?": false
94
+ }
95
+ },
96
+ "2.2 Cultural Diversity and Representation": {
97
+ "status": "N/A",
98
+ "sources": [],
99
+ "questions": {
100
+ "Use of evaluation methods developed in the cultural contexts in scope": false,
101
+ "Respect of indigenous sovereignty, protected rights, and cultural norms in AI system-generated content": false,
102
+ "Evaluation of cultural variation across geographic dimensions": false,
103
+ "Evaluation of cultural variation representing communities' perspectives within geographical contexts": false,
104
+ "Analysis of how cultural context affects AI system performance": false
105
+ }
106
+ },
107
+ "2.3 Generated Sensitive Content across Cultural Contexts": {
108
+ "status": "Yes",
109
+ "sources": [
110
+ {
111
+ "type": "🌐",
112
+ "detail": "https://arxiv.org/abs/2402.19173",
113
+ "name": "HONEST - Hurtful Sentence Completion in English Language Models"
114
+ },
115
+ {
116
+ "type": "🌐",
117
+ "detail": "https://arxiv.org/abs/2402.19173",
118
+ "name": "RealToxicityPrompts"
119
+ }
120
+ ],
121
+ "questions": {
122
+ "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true,
123
+ "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false,
124
+ "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false,
125
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions not reflective of their cultural context": false,
126
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to inappropriate content for their use context": true,
127
+ "Has the AI system been evaluated for its likelihood of exposing its direct users to content with negative psychological impacts": false,
128
+ "Has the evaluation of the AI system's behaviors explicitly considered cultural variation in their definition": false
129
+ }
130
+ },
131
+ "2.4 Cultural Variation Transparency and Documentation": {
132
+ "status": "N/A",
133
+ "sources": [],
134
+ "questions": {
135
+ "Documentation of cultural contexts considered during development": false,
136
+ "Documentation of the range of cultural contexts covered by evaluations": false,
137
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
138
+ "Construct validity, documentation of strengths, weaknesses, and assumptions": false,
139
+ "Domain shift between evaluation development and AI system development settings": false,
140
+ "Sufficient documentation of evaluation methods to replicate findings": false,
141
+ "Sufficient documentation of evaluation results to support comparison": false,
142
+ "Documentation of psychological impact on evaluators reviewing harmful content": false,
143
+ "Documentation of measures to protect evaluator well-being": false
144
+ }
145
+ }
146
+ },
147
+ "3. Disparate Performance Evaluation": {
148
+ "3.1 Disparate Performance Overview": {
149
+ "status": "N/A",
150
+ "sources": [],
151
+ "questions": {
152
+ "Have development choices and intrinsic properties of the AI system been evaluated for their contribution to disparate performance?": false,
153
+ "Have extrinsic disparate performance evaluations been run": false,
154
+ "Have evaluations been run across all applicable modalities": false,
155
+ "Have disparate performance evaluations been run that take the form of automatic quantitative evaluation": false,
156
+ "Have disparate performance evaluations been run with human participants": false
157
+ }
158
+ },
159
+ "3.2 Identifying Target Groups for Disparate Performance Evaluation": {
160
+ "status": "N/A",
161
+ "sources": [],
162
+ "questions": {
163
+ "Identification of mandated target group based on legal nondiscrimination frameworks": false,
164
+ "Identification of further target groups that are likely to be harmed by disparate performance": false,
165
+ "Assessment of systemic barriers in dataset collection methods for different groups": false,
166
+ "Consideration of historical disparities in the task in which the AI system is deployed": false,
167
+ "Identification of both implicit and explicit markers for the target groups": false
168
+ }
169
+ },
170
+ "3.3 Subgroup Performance Analysis": {
171
+ "status": "N/A",
172
+ "sources": [],
173
+ "questions": {
174
+ "Non-aggregated evaluation results across subpopulations, including feature importance and consistency analysis": false,
175
+ "Metrics to measure performance in decision-making tasks": false,
176
+ "Metrics to measure disparate performance in other tasks including generative tasks": false,
177
+ "Worst-case subgroup performance analysis, including performance on rare or underrepresented cases": false,
178
+ "Intersectional analysis examining performance across combinations of subgroup": false,
179
+ "Do evaluations of disparate performance account for implicit social group markers": false
180
+ }
181
+ },
182
+ "3.4 Disparate Performance Evaluation Transparency and Documentation": {
183
+ "status": "N/A",
184
+ "sources": [],
185
+ "questions": {
186
+ "Sufficient documentation of evaluation method to understand the scope of the findings": false,
187
+ "Documentation of strengths, weaknesses, and assumptions about the context": false,
188
+ "Documentation of domain shift between evaluation and deployment settings": false,
189
+ "Sufficient documentation of evaluation methods to replicate findings": false,
190
+ "Sufficient documentation of evaluation results to support comparison": false,
191
+ "Documentation of disparate performance mitigation measures": false,
192
+ "Documentation of disparate performance monitoring approaches": false
193
+ }
194
+ }
195
+ },
196
+ "4. Environmental Costs and Carbon Emissions Evaluation": {
197
+ "4.1 Environmental Costs Overview": {
198
+ "status": "Yes",
199
+ "sources": [
200
+ {
201
+ "type": "🌐",
202
+ "detail": "https://mlco2.github.io/impact/#compute",
203
+ "name": "Machine Learning Emissions Calculator"
204
+ }
205
+ ],
206
+ "questions": {
207
+ "Evaluations of different processes within development and deployment": false,
208
+ "Have evaluations been run across all applicable modalities?": true,
209
+ "Have evaluations been run on standardized benchmarks or metrics?": true,
210
+ "Have evaluations taken into account community feedback from regions affected by data center power consumption?": false,
211
+ "Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?": false
212
+ }
213
+ },
214
+ "4.2 Energy Cost and Environmental Impact of Development": {
215
+ "status": "Yes",
216
+ "sources": [
217
+ {
218
+ "type": "🌐",
219
+ "detail": "https://mlco2.github.io/impact/#compute",
220
+ "name": "Machine Learning Emissions Calculator"
221
+ }
222
+ ],
223
+ "questions": {
224
+ "Accounting of FLOPS across development stages": true,
225
+ "Evaluation of energy consumption using standardized tracking tools": true,
226
+ "Evaluation of carbon impact accounting for regional energy sources": true,
227
+ "Evaluation of hardware lifecycle environmental impact": false
228
+ }
229
+ },
230
+ "4.3 Energy Cost and Environmental Impact of Deployment": {
231
+ "status": "N/A",
232
+ "sources": [],
233
+ "questions": {
234
+ "Evaluation of inference FLOPS for the system": false,
235
+ "Evaluation of inference energy consumption on most common deployment setting": false,
236
+ "Evaluation of inference energy consumption on multiple deployment settings": false,
237
+ "Evaluation of task-specific energy consumption variations": false,
238
+ "Evaluation of carbon impact for deployment infrastructure": false,
239
+ "Evaluation of hardware lifecycle environmental impact for deployment": false
240
+ }
241
+ },
242
+ "4.4 Environmental Costs Transparency and Documentation": {
243
+ "status": "Yes",
244
+ "sources": [
245
+ {
246
+ "type": "🌐",
247
+ "detail": "https://mlco2.github.io/impact/#compute",
248
+ "name": "Machine Learning Emissions Calculator"
249
+ }
250
+ ],
251
+ "questions": {
252
+ "Documentation about equipment and infrastructure specifications": true,
253
+ "Sufficient documentation of evaluation methods including components covered": false,
254
+ "Sufficient documentation of evaluation methods to replicate findings": true,
255
+ "Sufficient documentation of evaluation results for comparison": true
256
+ }
257
+ }
258
+ },
259
+ "5. Privacy and Data Protection Evaluation": {
260
+ "5.1 Privacy and Data Protection Overview": {
261
+ "status": "Yes",
262
+ "sources": [
263
+ {
264
+ "type": "🏢",
265
+ "detail": "PII detection and redaction using an NER model"
266
+ },
267
+ {
268
+ "type": "🌐",
269
+ "detail": "https://huggingface.co/spaces/bigcode/in-the-stack",
270
+ "name": "Opt-out tool for users"
271
+ },
272
+ {
273
+ "type": "🌐",
274
+ "detail": "https://arxiv.org/abs/2402.19173",
275
+ "name": "Asleep at the Keyboard Security Benchmark"
276
+ }
277
+ ],
278
+ "questions": {
279
+ "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true,
280
+ "Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities": false,
281
+ "Have extrinsic privacy evaluations been run": true,
282
+ "Have evaluations been run across all applicable modalities": true,
283
+ "Have privacy evaluations been run that take the form of automatic quantitative evaluation": true,
284
+ "Have privacy evaluations been run with human participants?": false
285
+ }
286
+ },
287
+ "5.2 Privacy, Likeness, and Publicity Harms": {
288
+ "status": "N/A",
289
+ "sources": [],
290
+ "questions": {
291
+ "Has the AI system been evaluated for its likelihood of revealing personal information from its training data?": false,
292
+ "Has the AI system been evaluated for its likelihood of facilitating generation of content impersonating an individual?": false,
293
+ "Has the AI system been evaluated for its likelihood of providing made up or confabulated personal information about individuals?": false
294
+ }
295
+ },
296
+ "5.3 Intellectual Property and Information Security": {
297
+ "status": "Yes",
298
+ "sources": [
299
+ {
300
+ "type": "🏢",
301
+ "detail": "Membership test to find if generated code was copied from the training corpus"
302
+ },
303
+ {
304
+ "type": "🏢",
305
+ "detail": "Code attribution tool to find the original author and license of the generated code"
306
+ },
307
+ {
308
+ "type": "🌐",
309
+ "detail": "https://arxiv.org/abs/2402.19173",
310
+ "name": "Asleep at the Keyboard Security Benchmark"
311
+ }
312
+ ],
313
+ "questions": {
314
+ "Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data": true,
315
+ "Has the system been evaluated for other information security risks for in-scope uses": false
316
+ }
317
+ },
318
+ "5.4 Privacy Evaluation Transparency and Documentation": {
319
+ "status": "Yes",
320
+ "sources": [
321
+ {
322
+ "type": "🏢",
323
+ "detail": "Documentation of training data information risk categories and consent status"
324
+ }
325
+ ],
326
+ "questions": {
327
+ "Documentation of the categories of training data that present information risk": true,
328
+ "Documentation of evaluation methods to replicate findings": true,
329
+ "Documentation of evaluation results to support comparison": true,
330
+ "Documentation of evaluation limitations": false,
331
+ "Documentation of deployment considerations": false
332
+ }
333
+ }
334
+ },
335
+ "6. Financial Costs Evaluation": {
336
+ "6.1 Financial Costs Overview": {
337
+ "status": "N/A",
338
+ "sources": [],
339
+ "questions": {
340
+ "Evaluation of costs at various stages": false,
341
+ "Have costs been evaluated for different system components": false,
342
+ "Have cost evaluations been run across all applicable modalities": false,
343
+ "Have cost evaluations included both direct and indirect expenses": false,
344
+ "Have cost projections been validated against actual expenses": false
345
+ }
346
+ },
347
+ "6.2 Development and Training Costs": {
348
+ "status": "N/A",
349
+ "sources": [],
350
+ "questions": {
351
+ "Assessment of research and development labor costs": false,
352
+ "Evaluation of data collection and preprocessing costs": false,
353
+ "Assessment of training infrastructure costs": false,
354
+ "Assessment of costs associated with different training approaches": false,
355
+ "Evaluation of model architecture and size impact on costs": false
356
+ }
357
+ },
358
+ "6.3 Deployment and Operation Costs": {
359
+ "status": "N/A",
360
+ "sources": [],
361
+ "questions": {
362
+ "Assessment of inference and serving costs": false,
363
+ "Evaluation of storage and hosting expenses": false,
364
+ "Assessment of scaling costs based on usage patterns": false,
365
+ "Evaluation of costs specific to different deployment contexts": false,
366
+ "Assessment of costs for model updates or fine-tuning by end users": false
367
+ }
368
+ },
369
+ "6.4 Financial Cost Documentation and Transparency": {
370
+ "status": "N/A",
371
+ "sources": [],
372
+ "questions": {
373
+ "Sufficient documentation of cost evaluation methodology and assumptions": false,
374
+ "Sufficient documentation of cost breakdowns and metrics": false,
375
+ "Documentation of cost variations across different usage scenarios": false,
376
+ "Documentation of long-term cost projections and risk factors": false
377
+ }
378
+ }
379
+ },
380
+ "7. Data and Content Moderation Labor Evaluation": {
381
+ "7.1 Labor Evaluation Overview": {
382
+ "status": "Yes",
383
+ "sources": [
384
+ {
385
+ "type": "🏢",
386
+ "detail": "PII annotations by human annotators with fair wage"
387
+ }
388
+ ],
389
+ "questions": {
390
+ "Evaluation of labor practices at various stages": true,
391
+ "Have labor conditions been evaluated for different worker categories": true,
392
+ "Have labor evaluations been run across all applicable task types": false,
393
+ "Have labor practices been evaluated against established industry standards": true,
394
+ "Have labor evaluations included both direct employees and contracted workers": false,
395
+ "Have evaluations considered different regional and jurisdictional contexts": true
396
+ }
397
+ },
398
+ "7.2 Working Conditions and Compensation": {
399
+ "status": "Yes",
400
+ "sources": [
401
+ {
402
+ "type": "🏢",
403
+ "detail": "PII annotations by human annotators with fair wage"
404
+ }
405
+ ],
406
+ "questions": {
407
+ "Assessment of compensation relative to local living wages and industry standards": true,
408
+ "Assessment of job security and employment classification": false,
409
+ "Evaluation of workplace safety, worker protections and rights": false,
410
+ "Assessment of worker autonomy and task assignment practices": false,
411
+ "Evaluation of power dynamics and worker feedback mechanisms": false
412
+ }
413
+ },
414
+ "7.3 Worker Wellbeing and Support": {
415
+ "status": "N/A",
416
+ "sources": [],
417
+ "questions": {
418
+ "Assessment of psychological support systems, trauma resources, and other long-term mental health monitoring": false,
419
+ "Evaluation of training and preparation for difficult content": false,
420
+ "Evaluation of cultural and linguistic support for diverse workforces": false
421
+ }
422
+ },
423
+ "7.4 Labor Practice Documentation and Transparency": {
424
+ "status": "Yes",
425
+ "sources": [
426
+ {
427
+ "type": "🏢",
428
+ "detail": "PII annotations by human annotators with fair wage"
429
+ }
430
+ ],
431
+ "questions": {
432
+ "Documentation of labor evaluation methodology and frameworks used": true,
433
+ "Documentation of worker demographics and task distribution": false,
434
+ "Documentation of support systems, worker protections": false,
435
+ "Documentation of incident reporting and resolution procedures": false
436
+ }
437
+ }
438
+ }
439
+ }
440
+ }