Spaces:

reducto
/

rd_table_bench

Running

raunakdoesdev committed on Nov 5, 2024

Commit

727cda5

1 Parent(s): 3708509

simplify

Files changed (4) hide show

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ unzipped_dataset/

pages/1_🔍_Explorer.py → 1_🔍_Explorer.py RENAMED Viewed

@@ -1,16 +1,41 @@
-from huggingface_hub import snapshot_download, hf_hub_download
 import streamlit as st
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
 st.set_page_config(layout="wide")
-results = hf_hub_download(
-    repo_id="reducto/rd-tablebench",
-    filename="providers/scores.csv",
-    repo_type="dataset",
-)
 st.html("""
 <style>
@@ -31,13 +56,6 @@ td, th {
 """)
-@st.cache_resource(show_spinner="Loading dataset (can take ~3 min)")
-def load_dataset_1():
-    return snapshot_download(repo_id="reducto/rd-tablebench", repo_type="dataset")
-dataset = load_dataset_1()
 df = pd.read_csv(results)
 if "current_index" not in st.session_state:
@@ -121,6 +139,7 @@ with col2:
     image_path = f"{dataset}/_images/{row['pdf_path'].replace('.pdf', '.jpg')}"
     st.image(image_path, use_column_width=True)
 st.subheader("Groundtruth")
 st.html(f"{dataset}/groundtruth/{row['pdf_path'].replace('.pdf', '.html')}")

+from huggingface_hub import hf_hub_download
 import streamlit as st
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
+import zipfile
+import shutil
 st.set_page_config(layout="wide")
+with st.spinner("Downloading dataset"):
+    results = hf_hub_download(
+        repo_id="reducto/rd-tablebench",
+        filename="rd-tablebench.zip",
+        repo_type="dataset",
+    )
+def unzip_dataset():
+    if not os.path.exists("unzipped_dataset"):
+        os.makedirs("unzipped_dataset")
+        with st.spinner("Unzipping dataset"):
+            with zipfile.ZipFile(results, "r") as zip_ref:
+                zip_ref.extractall("unzipped_dataset")
+    return "unzipped_dataset/rd-tablebench"
+if st.button("Redo Unzip"):
+    if os.path.exists("unzipped_dataset"):
+        shutil.rmtree("unzipped_dataset")
+        st.rerun()
+dataset = unzip_dataset()
+results = f"{dataset}/providers/scores.csv"
+assert os.path.exists(results)
 st.html("""
 <style>
 """)
 df = pd.read_csv(results)
 if "current_index" not in st.session_state:
     image_path = f"{dataset}/_images/{row['pdf_path'].replace('.pdf', '.jpg')}"
     st.image(image_path, use_column_width=True)
+st.write(row)
 st.subheader("Groundtruth")
 st.html(f"{dataset}/groundtruth/{row['pdf_path'].replace('.pdf', '.html')}")

README.md CHANGED Viewed

@@ -5,10 +5,8 @@ colorFrom: green
 colorTo: gray
 sdk: streamlit
 sdk_version: 1.39.0
-app_file: _📄_README.py
 pinned: false
-license: agpl-3.0
 short_description: Reducto's SOTA human annotated table benchmark.
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorTo: gray
 sdk: streamlit
 sdk_version: 1.39.0
+app_file: 1_🔍_Explorer.py
 pinned: false
+license: cc-by-nc-nd-4.0
 short_description: Reducto's SOTA human annotated table benchmark.
+---

_📄_README.py DELETED Viewed

File without changes