raunakdoesdev committed on
Commit
727cda5
·
1 Parent(s): 3708509
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ unzipped_dataset/
pages/1_🔍_Explorer.py → 1_🔍_Explorer.py RENAMED
@@ -1,16 +1,41 @@
1
- from huggingface_hub import snapshot_download, hf_hub_download
2
  import streamlit as st
3
  import pandas as pd
4
  import matplotlib.pyplot as plt
5
  import os
 
 
6
 
7
  st.set_page_config(layout="wide")
8
 
9
- results = hf_hub_download(
10
- repo_id="reducto/rd-tablebench",
11
- filename="providers/scores.csv",
12
- repo_type="dataset",
13
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  st.html("""
16
  <style>
@@ -31,13 +56,6 @@ td, th {
31
  """)
32
 
33
 
34
- @st.cache_resource(show_spinner="Loading dataset (can take ~3 min)")
35
- def load_dataset_1():
36
- return snapshot_download(repo_id="reducto/rd-tablebench", repo_type="dataset")
37
-
38
-
39
- dataset = load_dataset_1()
40
-
41
  df = pd.read_csv(results)
42
 
43
  if "current_index" not in st.session_state:
@@ -121,6 +139,7 @@ with col2:
121
  image_path = f"{dataset}/_images/{row['pdf_path'].replace('.pdf', '.jpg')}"
122
  st.image(image_path, use_column_width=True)
123
 
 
124
  st.subheader("Groundtruth")
125
  st.html(f"{dataset}/groundtruth/{row['pdf_path'].replace('.pdf', '.html')}")
126
 
 
1
+ from huggingface_hub import hf_hub_download
2
  import streamlit as st
3
  import pandas as pd
4
  import matplotlib.pyplot as plt
5
  import os
6
+ import zipfile
7
+ import shutil
8
 
9
  st.set_page_config(layout="wide")
10
 
11
+ with st.spinner("Downloading dataset"):
12
+ results = hf_hub_download(
13
+ repo_id="reducto/rd-tablebench",
14
+ filename="rd-tablebench.zip",
15
+ repo_type="dataset",
16
+ )
17
+
18
+
19
+ def unzip_dataset():
20
+ if not os.path.exists("unzipped_dataset"):
21
+ os.makedirs("unzipped_dataset")
22
+ with st.spinner("Unzipping dataset"):
23
+ with zipfile.ZipFile(results, "r") as zip_ref:
24
+ zip_ref.extractall("unzipped_dataset")
25
+ return "unzipped_dataset/rd-tablebench"
26
+
27
+
28
+ if st.button("Redo Unzip"):
29
+ if os.path.exists("unzipped_dataset"):
30
+ shutil.rmtree("unzipped_dataset")
31
+ st.rerun()
32
+
33
+
34
+ dataset = unzip_dataset()
35
+
36
+ results = f"{dataset}/providers/scores.csv"
37
+
38
+ assert os.path.exists(results)
39
 
40
  st.html("""
41
  <style>
 
56
  """)
57
 
58
 
 
 
 
 
 
 
 
59
  df = pd.read_csv(results)
60
 
61
  if "current_index" not in st.session_state:
 
139
  image_path = f"{dataset}/_images/{row['pdf_path'].replace('.pdf', '.jpg')}"
140
  st.image(image_path, use_column_width=True)
141
 
142
+ st.write(row)
143
  st.subheader("Groundtruth")
144
  st.html(f"{dataset}/groundtruth/{row['pdf_path'].replace('.pdf', '.html')}")
145
 
README.md CHANGED
@@ -5,10 +5,8 @@ colorFrom: green
5
  colorTo: gray
6
  sdk: streamlit
7
  sdk_version: 1.39.0
8
- app_file: _📄_README.py
9
  pinned: false
10
- license: agpl-3.0
11
  short_description: Reducto's SOTA human annotated table benchmark.
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
5
  colorTo: gray
6
  sdk: streamlit
7
  sdk_version: 1.39.0
8
+ app_file: 1_🔍_Explorer.py
9
  pinned: false
10
+ license: cc-by-nc-nd-4.0
11
  short_description: Reducto's SOTA human annotated table benchmark.
12
+ ---
 
 
_📄_README.py DELETED
File without changes