davanstrien HF staff committed on
Commit
5f7ffc5
·
1 Parent(s): de729bd

deal with samples properly

Browse files
Files changed (1) hide show
  1. app.py +23 -5
app.py CHANGED
@@ -136,14 +136,31 @@ def process_pdfs(
136
  progress(0, desc="Starting PDF processing")
137
  images, message = pdf_to_images(pdf_files, sample_size, images_dir)
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  zip_path = None
140
  if create_zip:
141
- # Create a zip file of the images
142
  zip_path = os.path.join(temp_dir, "converted_images.zip")
143
  with zipfile.ZipFile(zip_path, "w") as zipf:
144
  progress(0, desc="Zipping images")
145
  for image in progress.tqdm(images, desc="Zipping images"):
146
- zipf.write(image, os.path.basename(image))
 
 
 
147
  message += f"\nCreated zip file with {len(images)} images"
148
 
149
  if hf_repo:
@@ -154,11 +171,12 @@ def process_pdfs(
154
  repo_type="dataset",
155
  private=private_repo,
156
  )
157
- hf_api.upload_large_folder(
158
- folder_path=temp_dir,
 
159
  repo_id=hf_repo,
160
  repo_type="dataset",
161
- # path_in_repo="images",
162
  )
163
 
164
  # Determine size category
 
136
  progress(0, desc="Starting PDF processing")
137
  images, message = pdf_to_images(pdf_files, sample_size, images_dir)
138
 
139
+ # Create a new directory for sampled images
140
+ sampled_images_dir = os.path.join(temp_dir, "sampled_images")
141
+ os.makedirs(sampled_images_dir)
142
+
143
+ # Move sampled images to the new directory and update paths
144
+ updated_images = []
145
+ for image in images:
146
+ new_path = os.path.join(sampled_images_dir, os.path.basename(image))
147
+ shutil.move(image, new_path)
148
+ updated_images.append(new_path)
149
+
150
+ # Update the images list with new paths
151
+ images = updated_images
152
+
153
  zip_path = None
154
  if create_zip:
155
+ # Create a zip file of the sampled images
156
  zip_path = os.path.join(temp_dir, "converted_images.zip")
157
  with zipfile.ZipFile(zip_path, "w") as zipf:
158
  progress(0, desc="Zipping images")
159
  for image in progress.tqdm(images, desc="Zipping images"):
160
+ zipf.write(
161
+ os.path.join(sampled_images_dir, os.path.basename(image)),
162
+ os.path.basename(image),
163
+ )
164
  message += f"\nCreated zip file with {len(images)} images"
165
 
166
  if hf_repo:
 
171
  repo_type="dataset",
172
  private=private_repo,
173
  )
174
+ # Upload only the sampled images directory
175
+ hf_api.upload_folder(
176
+ folder_path=sampled_images_dir,
177
  repo_id=hf_repo,
178
  repo_type="dataset",
179
+ path_in_repo="images",
180
  )
181
 
182
  # Determine size category