from datasets import Dataset, Features, Value, Image
from huggingface_hub import HfApi
import os
from collections import defaultdict
import pandas as pd
import argparse
from PIL import Image as PILImage
import sys
import logging


def upload_to_dataset(original_images_dir, processed_images_dir, dataset_name, dry_run=False):
    """Upload original and processed images to a Hugging Face dataset, including BiRefNet results.

    Args:
        original_images_dir: Directory containing the original images.
        processed_images_dir: Directory containing the processed images, with one subfolder per model.
        dataset_name: Name of the dataset repository to upload to on the Hugging Face Hub.
        dry_run: If True, build the dataset locally without pushing it to the Hub.
    """
    logging.info(f"Starting dataset upload from {original_images_dir}")

    # Define the dataset features with dedicated columns for each model
    features = Features({
        "original_image": Image(),
        "clipdrop_image": Image(),
        "bria_image": Image(),
        "photoroom_image": Image(),
        "removebg_image": Image(),
        "birefnet_image": Image(),
        "original_filename": Value("string")
    })
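
    # Note: the Image() columns are populated with plain file paths below; the datasets
    # library decodes them on access and embeds the files when the dataset is pushed.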
    # Load image paths and metadata
    data = defaultdict(lambda: {
        "clipdrop_image": None,
        "bria_image": None,
        "photoroom_image": None,
        "removebg_image": None,
        "birefnet_image": None
    })
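
    # Expected layout: each provider's output lives at
    # processed_images_dir/<source>/<original basename> with any supported extension.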
    # Walk into the original images folder
    for root, _, files in os.walk(original_images_dir):
        for f in files:
            if f.endswith(('.png', '.jpg', '.jpeg', '.webp')):
                original_image_path = os.path.join(root, f)
                data[f]["original_image"] = original_image_path
                data[f]["original_filename"] = f

                # Check for corresponding images in processed directories
                for source in ["clipdrop", "bria", "photoroom", "removebg", "birefnet"]:
                    for ext in ['.png', '.jpg', '.jpeg', '.webp']:
                        processed_image_filename = os.path.splitext(f)[0] + ext
                        source_image_path = os.path.join(processed_images_dir, source, processed_image_filename)
                        if os.path.exists(source_image_path):
                            data[f][f"{source}_image"] = source_image_path
                            break
    # Convert the data to a dictionary of lists
    dataset_dict = {
        "original_image": [],
        "clipdrop_image": [],
        "bria_image": [],
        "photoroom_image": [],
        "removebg_image": [],
        "birefnet_image": [],
        "original_filename": []
    }
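
    # Providers with no matching output keep None in their column; the Image()
    # feature generally accepts missing values, so those cells are stored as nulls.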
    errors = []
    processed_count = 0
    skipped_count = 0

    # Validate each entry: every processed image must match the original's dimensions
    for filename, entry in data.items():
        if "original_image" in entry:
            try:
                original_size = PILImage.open(entry["original_image"]).size
                valid_entry = True
                for source in ["clipdrop_image", "bria_image", "photoroom_image", "removebg_image", "birefnet_image"]:
                    if entry[source] is not None:
                        try:
                            processed_size = PILImage.open(entry[source]).size
                            if processed_size != original_size:
                                errors.append(f"Size mismatch for {filename}: {source}")
                                valid_entry = False
                        except Exception as e:
                            errors.append(f"Error with {filename}: {source} ({e})")
                            valid_entry = False
                if valid_entry:
                    for key in dataset_dict.keys():
                        if key in entry:
                            dataset_dict[key].append(entry[key])
                    processed_count += 1
                else:
                    skipped_count += 1
            except Exception as e:
                errors.append(f"Error processing {filename}: {e}")
                skipped_count += 1

    if errors:
        logging.warning(f"Encountered {len(errors)} errors during processing")
    logging.info(f"Processed: {processed_count}, Skipped: {skipped_count}, Total: {processed_count + skipped_count}")
    # Save the data dictionary to a CSV file for inspection
    df = pd.DataFrame.from_dict(dataset_dict)
    df.to_csv("image_data.csv", index=False)

    # Create a Dataset
    dataset = Dataset.from_dict(dataset_dict, features=features)

    if dry_run:
        logging.info("Dry run completed - dataset not pushed")
    else:
        logging.info(f"Pushing dataset to {dataset_name}")
        api = HfApi()
        dataset.push_to_hub(dataset_name, token=api.token, private=True)
        logging.info("Upload completed successfully")


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    parser = argparse.ArgumentParser(description="Upload images to a Hugging Face dataset.")
    parser.add_argument("original_images_dir", type=str, help="Directory containing the original images.")
    parser.add_argument("processed_images_dir", type=str, help="Directory containing the processed images with subfolders for each model.")
    parser.add_argument("dataset_name", type=str, help="Name of the dataset to upload to Hugging Face Hub.")
    parser.add_argument("--dry-run", action="store_true", help="Perform a dry run without uploading to the hub.")
    args = parser.parse_args()

    upload_to_dataset(args.original_images_dir, args.processed_images_dir, args.dataset_name, dry_run=args.dry_run)
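
# Example invocation (script filename, paths, and repo id are placeholders; adjust to your setup):
#   python upload_to_dataset.py ./originals ./processed my-org/background-removal-eval --dry-run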