OSainz committed on
Commit
fd6f269
·
1 Parent(s): 540407e

Small changes

Browse files
Files changed (4) hide show
  1. app.py +3 -1
  2. contamination_report.csv +1 -4
  3. dataset.py +14 -3
  4. utils.py +3 -0
app.py CHANGED
@@ -38,8 +38,10 @@ def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
38
  | (dataframe["Test Split"] > 0.0)
39
  ]
40
 
 
 
41
  return dataframe.style.format(
42
- {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"}
43
  )
44
 
45
 
 
38
  | (dataframe["Test Split"] > 0.0)
39
  ]
40
 
41
+ dataframe = dataframe.sort_values("Test Split", ascending=False)
42
+
43
  return dataframe.style.format(
44
+ {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"}, na_rep="Unknown"
45
  )
46
 
47
 
contamination_report.csv CHANGED
@@ -1,4 +1 @@
1
- Evaluation Dataset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Citation;PR Link
2
- conll2003;google/gemma-7b;model;1.0;1.0;1.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;
3
- conll2003;EleutherAI/the_pile_deduplicated;corpus;1.0;1.0;1.0;data-based;https://aclanthology.org/2023.findings-emnlp.722/;www.google.com
4
- Test;lololol;corpus;1.0;1.0;1.0;data-based;https://arxiv.org/abs/2310.03668;
 
1
+ Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
 
 
 
dataset.py CHANGED
@@ -207,7 +207,7 @@ def get_dataframe():
207
  favicon_dict = {}
208
 
209
  # Update the favicon dictionary
210
- favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Citation"]])
211
 
212
  # Update the model url dictionary
213
  model_url_dict = update_model_url_cache(
@@ -221,7 +221,7 @@ def get_dataframe():
221
  )
222
 
223
  # Add favicons URLs to the dataframe in a vectorized manner
224
- data["Citation"] = data["Citation"].apply(
225
  lambda x: build_text_icon(
226
  text=get_domain_name(x),
227
  url=x,
@@ -229,7 +229,7 @@ def get_dataframe():
229
  )
230
  )
231
 
232
- data["PR Link"] = data["PR Link"].apply(
233
  lambda x: build_text_icon(
234
  text="",
235
  url=x if x == x else "no link",
@@ -245,6 +245,13 @@ def get_dataframe():
245
  )
246
  )
247
 
 
 
 
 
 
 
 
248
  # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
249
  data["Contaminated Source"] = data.apply(
250
  lambda x: build_text_icon(
@@ -257,4 +264,8 @@ def get_dataframe():
257
  axis=1,
258
  )
259
 
 
 
 
 
260
  return data
 
207
  favicon_dict = {}
208
 
209
  # Update the favicon dictionary
210
+ favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])
211
 
212
  # Update the model url dictionary
213
  model_url_dict = update_model_url_cache(
 
221
  )
222
 
223
  # Add favicons URLs to the dataframe in a vectorized manner
224
+ data["Reference"] = data["Reference"].apply(
225
  lambda x: build_text_icon(
226
  text=get_domain_name(x),
227
  url=x,
 
229
  )
230
  )
231
 
232
+ data["PR"] = data["PR"].apply(
233
  lambda x: build_text_icon(
234
  text="",
235
  url=x if x == x else "no link",
 
245
  )
246
  )
247
 
248
+ data["Evaluation Dataset"] = data.apply(
249
+ lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})" if pd.notna(x["Subset"]) else x["Evaluation Dataset"],
250
+ axis=1,
251
+ )
252
+
253
+ del data["Subset"]
254
+
255
  # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
256
  data["Contaminated Source"] = data.apply(
257
  lambda x: build_text_icon(
 
264
  axis=1,
265
  )
266
 
267
+ data["Train Split"] = data["Train Split"].apply(lambda x: x/100 if x else x)
268
+ data["Development Split"] = data["Development Split"].apply(lambda x: x/100 if x else x)
269
+ data["Test Split"] = data["Test Split"].apply(lambda x: x/100 if x else x)
270
+
271
  return data
utils.py CHANGED
@@ -38,6 +38,9 @@ def get_domain_name(url: str) -> str:
38
  domain = "{uri.netloc}".format(uri=parsed_uri)
39
  if domain.startswith("www."):
40
  domain = domain[4:]
 
 
 
41
  # First latter in uppercase
42
  return domain.capitalize()
43
 
 
38
  domain = "{uri.netloc}".format(uri=parsed_uri)
39
  if domain.startswith("www."):
40
  domain = domain[4:]
41
+
42
+ # Remove last domain
43
+ domain = ".".join(domain.split(".")[:-1])
44
  # First latter in uppercase
45
  return domain.capitalize()
46