davanstrien HF staff committed on
Commit
3c15d19
·
1 Parent(s): 0ef7a85

update options

Browse files
Files changed (1) hide show
  1. app.py +69 -9
app.py CHANGED
@@ -3,11 +3,15 @@ import random
3
  import shutil
4
  import tempfile
5
  import zipfile
 
6
 
7
  import gradio as gr
8
- from huggingface_hub import HfApi
9
  from pdf2image import convert_from_path
10
  from PyPDF2 import PdfReader
 
 
 
11
 
12
 
13
  def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
@@ -48,10 +52,25 @@ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
48
  return all_images, f"Saved {len(all_images)} images to temporary directory"
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def process_pdfs(
52
  pdf_files,
53
  sample_size,
54
  hf_repo,
 
 
55
  oauth_token: gr.OAuthToken | None,
56
  progress=gr.Progress(),
57
  ):
@@ -81,12 +100,15 @@ def process_pdfs(
81
  progress(0, desc="Starting PDF processing")
82
  images, message = pdf_to_images(pdf_files, sample_size, images_dir)
83
 
84
- # Create a zip file of the images
85
- zip_path = os.path.join(temp_dir, "converted_images.zip")
86
- with zipfile.ZipFile(zip_path, "w") as zipf:
87
- progress(0, desc="Zipping images")
88
- for image in progress.tqdm(images, desc="Zipping images"):
89
- zipf.write(image, os.path.basename(image))
 
 
 
90
 
91
  if hf_repo:
92
  try:
@@ -94,6 +116,7 @@ def process_pdfs(
94
  hf_api.create_repo(
95
  hf_repo,
96
  repo_type="dataset",
 
97
  )
98
  hf_api.upload_folder(
99
  folder_path=images_dir,
@@ -101,7 +124,41 @@ def process_pdfs(
101
  repo_type="dataset",
102
  path_in_repo="images",
103
  )
104
- message += f"\nUploaded images to Hugging Face repo: {hf_repo}/images"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  except Exception as e:
106
  message += f"\nFailed to upload to Hugging Face: {str(e)}"
107
 
@@ -140,6 +197,9 @@ with gr.Blocks() as demo:
140
  placeholder="username/repo-name",
141
  info="Enter the Hugging Face repository name in the format 'username/repo-name'",
142
  )
 
 
 
143
  with gr.Accordion("View converted images", open=False):
144
  output_gallery = gr.Gallery(label="Converted Images")
145
  status_text = gr.Markdown(label="Status")
@@ -148,7 +208,7 @@ with gr.Blocks() as demo:
148
  submit_button = gr.Button("Convert PDFs to page images")
149
  submit_button.click(
150
  process_pdfs,
151
- inputs=[pdf_files, sample_size, hf_repo],
152
  outputs=[output_gallery, download_button, status_text],
153
  )
154
 
 
3
  import shutil
4
  import tempfile
5
  import zipfile
6
+ from datetime import datetime
7
 
8
  import gradio as gr
9
+ from huggingface_hub import HfApi, DatasetCard, DatasetCardData
10
  from pdf2image import convert_from_path
11
  from PyPDF2 import PdfReader
12
+ from dataset_card_template import DATASET_CARD_TEMPLATE
13
+
14
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
15
 
16
 
17
  def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
 
52
  return all_images, f"Saved {len(all_images)} images to temporary directory"
53
 
54
 
55
def get_size_category(num_images):
    """Return the dataset-card ``size_categories`` label for an image count.

    The label is the bucket whose exclusive upper bound is the first one
    greater than ``num_images``; a count that sits exactly on a boundary
    (e.g. 1000) therefore falls into the higher bucket.

    Args:
        num_images: Number of images in the dataset.

    Returns:
        A size-category string such as ``"n<1K"`` or ``"1M<n<10M"``.
    """
    # (exclusive upper bound, label) pairs in ascending order. The labels
    # follow the Hub's size_categories vocabulary, which continues past one
    # million; a catch-all like "n>1M" is not part of that vocabulary.
    size_buckets = (
        (1_000, "n<1K"),
        (10_000, "1K<n<10K"),
        (100_000, "10K<n<100K"),
        (1_000_000, "100K<n<1M"),
        (10_000_000, "1M<n<10M"),
        (100_000_000, "10M<n<100M"),
        (1_000_000_000, "100M<n<1B"),
        (10_000_000_000, "1B<n<10B"),
        (100_000_000_000, "10B<n<100B"),
        (1_000_000_000_000, "100B<n<1T"),
    )
    for upper_bound, label in size_buckets:
        if num_images < upper_bound:
            return label
    # Anything at or above one trillion lands in the open-ended top bucket.
    return "n>1T"
66
+
67
+
68
  def process_pdfs(
69
  pdf_files,
70
  sample_size,
71
  hf_repo,
72
+ create_zip,
73
+ private_repo,
74
  oauth_token: gr.OAuthToken | None,
75
  progress=gr.Progress(),
76
  ):
 
100
  progress(0, desc="Starting PDF processing")
101
  images, message = pdf_to_images(pdf_files, sample_size, images_dir)
102
 
103
+ zip_path = None
104
+ if create_zip:
105
+ # Create a zip file of the images
106
+ zip_path = os.path.join(temp_dir, "converted_images.zip")
107
+ with zipfile.ZipFile(zip_path, "w") as zipf:
108
+ progress(0, desc="Zipping images")
109
+ for image in progress.tqdm(images, desc="Zipping images"):
110
+ zipf.write(image, os.path.basename(image))
111
+ message += f"\nCreated zip file with {len(images)} images"
112
 
113
  if hf_repo:
114
  try:
 
116
  hf_api.create_repo(
117
  hf_repo,
118
  repo_type="dataset",
119
+ private=private_repo,
120
  )
121
  hf_api.upload_folder(
122
  folder_path=images_dir,
 
124
  repo_type="dataset",
125
  path_in_repo="images",
126
  )
127
+
128
+ # Determine size category
129
+ size_category = get_size_category(len(images))
130
+
131
+ # Create DatasetCardData instance
132
+ card_data = DatasetCardData(
133
+ tags=["created-with-pdfs-to-page-images-converter", "pdf-to-image"],
134
+ size_categories=[size_category],
135
+ )
136
+
137
+ # Create and populate the dataset card
138
+ card = DatasetCard.from_template(
139
+ card_data,
140
+ template_path=None, # Use default template
141
+ hf_repo=hf_repo,
142
+ num_images=len(images),
143
+ num_pdfs=len(pdf_files),
144
+ sample_size=sample_size if sample_size > 0 else "All pages",
145
+ creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
146
+ )
147
+
148
+ # Add our custom content to the card
149
+ card.text = DATASET_CARD_TEMPLATE.format(
150
+ hf_repo=hf_repo,
151
+ num_images=len(images),
152
+ num_pdfs=len(pdf_files),
153
+ sample_size=sample_size if sample_size > 0 else "All pages",
154
+ creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
155
+ size_category=size_category,
156
+ )
157
+
158
+ repo_url = f"https://huggingface.co/datasets/{hf_repo}"
159
+ message += f"\nUploaded dataset card to Hugging Face repo: [{hf_repo}]({repo_url})"
160
+
161
+ card.push_to_hub(hf_repo)
162
  except Exception as e:
163
  message += f"\nFailed to upload to Hugging Face: {str(e)}"
164
 
 
197
  placeholder="username/repo-name",
198
  info="Enter the Hugging Face repository name in the format 'username/repo-name'",
199
  )
200
+ with gr.Row():
201
+ create_zip = gr.Checkbox(label="Create ZIP file of images?", value=False)
202
+ private_repo = gr.Checkbox(label="Make repository private?", value=False)
203
  with gr.Accordion("View converted images", open=False):
204
  output_gallery = gr.Gallery(label="Converted Images")
205
  status_text = gr.Markdown(label="Status")
 
208
  submit_button = gr.Button("Convert PDFs to page images")
209
  submit_button.click(
210
  process_pdfs,
211
+ inputs=[pdf_files, sample_size, hf_repo, create_zip, private_repo],
212
  outputs=[output_gallery, download_button, status_text],
213
  )
214