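"""Upload original and background-removed images to a Hugging Face dataset.

The processed images directory is expected to contain one subfolder per model
(clipdrop, bria, photoroom, removebg, birefnet), each holding an output file
named after the corresponding original image.

Example invocation (script name, paths and dataset name are illustrative):

    python upload_to_dataset.py ./originals ./processed my-org/bg-removal-results --dry-run
"""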
from datasets import Dataset, Features, Value, Image
from huggingface_hub import HfApi
import os
from collections import defaultdict
import pandas as pd
import argparse
from PIL import Image as PILImage
import logging

def upload_to_dataset(original_images_dir, processed_images_dir, dataset_name, dry_run=False):
    """Upload images to a Hugging Face dataset including BiRefNet results."""
    
    logging.info(f"Starting dataset upload from {original_images_dir}")
    
    # Define the dataset features with dedicated columns for each model
    features = Features({
        "original_image": Image(),
        "clipdrop_image": Image(),
        "bria_image": Image(),
        "photoroom_image": Image(),
        "removebg_image": Image(),
        "birefnet_image": Image(),
        "original_filename": Value("string")
    })

    # Load image paths and metadata
    data = defaultdict(lambda: {
        "clipdrop_image": None,
        "bria_image": None,
        "photoroom_image": None,
        "removebg_image": None,
        "birefnet_image": None
    })

    # Walk the original images directory and index each file by name
    for root, _, files in os.walk(original_images_dir):
        for f in files:
            if f.endswith(('.png', '.jpg', '.jpeg', '.webp')):
                original_image_path = os.path.join(root, f)
                data[f]["original_image"] = original_image_path
                data[f]["original_filename"] = f

                # Check for corresponding images in processed directories
                for source in ["clipdrop", "bria", "photoroom", "removebg", "birefnet"]:
                    for ext in ['.png', '.jpg', '.jpeg', '.webp']:
                        processed_image_filename = os.path.splitext(f)[0] + ext
                        source_image_path = os.path.join(processed_images_dir, source, processed_image_filename)

                        if os.path.exists(source_image_path):
                            data[f][f"{source}_image"] = source_image_path
                            break

    # Convert the data to a dictionary of lists
    dataset_dict = {
        "original_image": [],
        "clipdrop_image": [],
        "bria_image": [],
        "photoroom_image": [],
        "removebg_image": [],
        "birefnet_image": [],
        "original_filename": []
    }

    errors = []
    processed_count = 0
    skipped_count = 0

    for filename, entry in data.items():
        if "original_image" in entry:
            try:
                original_size = PILImage.open(entry["original_image"]).size
                valid_entry = True

                for source in ["clipdrop_image", "bria_image", "photoroom_image", "removebg_image", "birefnet_image"]:
                    if entry[source] is not None:
                        try:
                            processed_size = PILImage.open(entry[source]).size
                            if processed_size != original_size:
                                errors.append(f"Size mismatch for {filename}: {source}")
                                valid_entry = False
                        except Exception as e:
                            errors.append(f"Error with {filename}: {source}")
                            valid_entry = False

                if valid_entry:
                    for key in dataset_dict.keys():
                        if key in entry:
                            dataset_dict[key].append(entry[key])
                    processed_count += 1
                else:
                    skipped_count += 1

            except Exception as e:
                errors.append(f"Error processing {filename}")
                skipped_count += 1

    if errors:
        logging.warning(f"Encountered {len(errors)} errors during processing:")
        for error in errors:
            logging.warning(f"  {error}")

    logging.info(f"Processed: {processed_count}, Skipped: {skipped_count}, Total: {processed_count + skipped_count}")

    # Save the data dictionary to a CSV file for inspection
    df = pd.DataFrame.from_dict(dataset_dict)
    df.to_csv("image_data.csv", index=False)

    # Create a Dataset
    dataset = Dataset.from_dict(dataset_dict, features=features)

    if dry_run:
        logging.info("Dry run completed - dataset not pushed")
    else:
        logging.info(f"Pushing dataset to {dataset_name}")
        api = HfApi()
        dataset.push_to_hub(dataset_name, token=api.token, private=True)
        logging.info("Upload completed successfully")

if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    
    parser = argparse.ArgumentParser(description="Upload images to a Hugging Face dataset.")
    parser.add_argument("original_images_dir", type=str, help="Directory containing the original images.")
    parser.add_argument("processed_images_dir", type=str, help="Directory containing the processed images with subfolders for each model.")
    parser.add_argument("dataset_name", type=str, help="Name of the dataset to upload to Hugging Face Hub.")
    parser.add_argument("--dry-run", action="store_true", help="Perform a dry run without uploading to the hub.")
    
    args = parser.parse_args()
    
    upload_to_dataset(args.original_images_dir, args.processed_images_dir, args.dataset_name, dry_run=args.dry_run)