|
|
|
|
|
|
|
|
|
|
|
|
|
"""Store dataset indexes of datapoints selected by k-means algorithm.""" |
|
from argparse import ArgumentParser |
|
import numpy as np |
|
import os |
|
import h5py as h5 |
|
import faiss |
|
|
|
|
|
def main(args): |
|
if args["which_dataset"] == "imagenet": |
|
dataset_name_prefix = "ILSVRC" |
|
im_prefix = "IN" |
|
elif args["which_dataset"] == "coco": |
|
dataset_name_prefix = "COCO" |
|
im_prefix = "COCO" |
|
else: |
|
dataset_name_prefix = args["which_dataset"] |
|
im_prefix = args["which_dataset"] |
|
|
|
filename = os.path.join( |
|
args["data_root"], |
|
"%s%s_feats_%s_%s.hdf5" |
|
% ( |
|
dataset_name_prefix, |
|
args["resolution"], |
|
args["feature_extractor"], |
|
args["backbone_feature_extractor"], |
|
), |
|
) |
|
|
|
print("Loading features %s..." % (filename)) |
|
with h5.File(filename, "r") as f: |
|
features = f["feats"][:] |
|
features = np.array(features) |
|
|
|
features /= np.linalg.norm(features, axis=1, keepdims=True) |
|
|
|
feat_dim = 2048 |
|
|
|
print("Training k-means with %i centers..." % (args["kmeans_subsampled"])) |
|
kmeans = faiss.Kmeans( |
|
feat_dim, |
|
args["kmeans_subsampled"], |
|
niter=100, |
|
verbose=True, |
|
gpu=args["gpu"], |
|
min_points_per_centroid=200, |
|
spherical=False, |
|
) |
|
kmeans.train(features.astype(np.float32)) |
|
|
|
|
|
print("Finding closest instances to centers...") |
|
index = faiss.IndexFlatL2(feat_dim) |
|
index.add(features.astype(np.float32)) |
|
D, closest_sample = index.search(kmeans.centroids, 1) |
|
|
|
net_str = ( |
|
"rn50" |
|
if args["backbone_feature_extractor"] |
|
else args["backbone_feature_extractor"] |
|
) |
|
stored_filename = "%s_res%i_%s_%s_kmeans_k%i" % ( |
|
im_prefix, |
|
args["resolution"], |
|
net_str, |
|
args["feature_extractor"], |
|
args["kmeans_subsampled"], |
|
) |
|
np.save( |
|
os.path.join(args["data_root"], stored_filename), |
|
{"center_examples": closest_sample}, |
|
) |
|
print( |
|
"Instance indexes resulting from a subsampling based on k-means have been saved in file %s!" |
|
% (stored_filename) |
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = ArgumentParser( |
|
description="Storing cluster indexes for k-means-based data subsampling" |
|
) |
|
parser.add_argument( |
|
"--resolution", |
|
type=int, |
|
default=64, |
|
help="Data resolution (default: %(default)s)", |
|
) |
|
parser.add_argument( |
|
"--which_dataset", type=str, default="imagenet", help="Dataset choice." |
|
) |
|
parser.add_argument( |
|
"--data_root", |
|
type=str, |
|
default="data", |
|
help="Default location where data is stored (default: %(default)s)", |
|
) |
|
parser.add_argument( |
|
"--feature_extractor", |
|
type=str, |
|
default="classification", |
|
choices=["classification", "selfsupervised"], |
|
help="Choice of feature extractor", |
|
) |
|
parser.add_argument( |
|
"--backbone_feature_extractor", |
|
type=str, |
|
default="resnet50", |
|
choices=["resnet50"], |
|
help="Choice of feature extractor backbone", |
|
) |
|
parser.add_argument( |
|
"--kmeans_subsampled", |
|
type=int, |
|
default=-1, |
|
help="Number of k-means centers if using subsampled training instances" |
|
" (default: %(default)s)", |
|
) |
|
parser.add_argument( |
|
"--gpu", |
|
action="store_true", |
|
default=False, |
|
help="Use faiss with GPUs (default: %(default)s)", |
|
) |
|
args = vars(parser.parse_args()) |
|
main(args) |
|
|