jwyang
support arbitary size
eb1d5d5
import pathlib
import tempfile
from collections import OrderedDict
from typing import Tuple, Union
import logging
import os
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from timm.models.layers import DropPath, trunc_normal_
from .image_encoder import build_image_encoder
from .text_encoder import build_text_encoder
from .text_encoder import build_tokenizer
from .templates import DEFAULT_TEMPLATES
logger = logging.getLogger(__name__)
class UniCLModel(nn.Module):
def __init__(self, config: dict,):
super().__init__()
self.conf_lang_encoder = config['MODEL']['TEXT_ENCODER']
self.tokenizer = build_tokenizer(self.conf_lang_encoder)
self.text_encoder = build_text_encoder(self.conf_lang_encoder, self.tokenizer, config['VERBOSE'])
dim_projection = config['MODEL']['DIM_PROJECTION']
if hasattr(self.text_encoder, 'dim_out'):
dim_out = self.text_encoder.dim_out
else:
with torch.no_grad():
dim_out = self.text_encoder(
torch.zeros(1,1).type(torch.LongTensor)
)['last_hidden_state'].size(2)
self.text_projection = nn.Parameter(torch.empty(dim_out, dim_projection))
self.conf_image_encoder = config['MODEL']['IMAGE_ENCODER']
self.image_encoder = build_image_encoder(self.conf_image_encoder)
self.image_projection = nn.Parameter(
torch.empty(self.image_encoder.dim_out, dim_projection)
)
self.logit_scale = nn.Parameter(torch.ones([]))
trunc_normal_(self.text_projection, std=.02)
trunc_normal_(self.image_projection, std=.02)
def _convert_old_weights(self, model_dict):
model_dict_updated = {}
for k, v in model_dict.items():
if k.startswith('visual.'):
model_dict_updated['image_encoder.'+k[7:]] = v
elif k.startswith('text.'):
model_dict_updated['lang_encoder.'+k[5:]] = v
elif k == 'vision_projection':
model_dict_updated['image_projection'] = v
elif k == 'text_projection':
model_dict_updated['text_projection'] = v
else:
model_dict_updated[k] = v
return model_dict_updated
def from_pretrained(self, pretrained='', pretrained_layers=[], verbose=True):
if not os.path.isfile(pretrained):
logger.warning(f'=> Pretrained model ({pretrained}) is not a file, skip init weight')
return
pretrained_dict = torch.load(pretrained, map_location='cpu')
logger.info(f'=> Loading pretrained model {pretrained}')
pretrained_dict = self._convert_old_weights(pretrained_dict)
model_dict = self.state_dict()
pretrained_dict = {
k: v for k, v in pretrained_dict.items()
if k in model_dict.keys()
}
need_init_state_dict = {}
image_encoder_state_dict = {}
for k, v in pretrained_dict.items():
need_init = (
k.split('.')[0] in pretrained_layers
or pretrained_layers[0] == '*'
)
if need_init:
if k.startswith('image_encoder.'):
image_encoder_state_dict[k] = v
else:
if verbose:
logger.info(f'=> init {k} from {pretrained}')
need_init_state_dict[k] = v
self.image_encoder.from_state_dict(image_encoder_state_dict, ['*'], verbose)
self.load_state_dict(need_init_state_dict, strict=False)
@torch.jit.ignore
def no_weight_decay(self):
no_weight_decay = {'logit_scale'}
if hasattr(self.text_encoder, 'no_weight_decay'):
for k in self.text_encoder.no_weight_decay():
no_weight_decay.add('lang_encoder.'+k)
if hasattr(self.image_encoder, 'no_weight_decay'):
for k in self.image_encoder.no_weight_decay():
no_weight_decay.add('image_encoder.'+k)
return no_weight_decay
@property
def dtype(self):
return self.logit_scale.dtype
def get_imnet_embeddings(self):
templates = IMAGENET_DEFAULT_TEMPLATES[:1]
clss_embeddings = []
for clss in IMAGENET_CLASSES:
txts = [template.format(clss) for template in templates]
tokens = self.tokenizer(
txts, padding='max_length', truncation=True, max_length=77, return_tensors='pt'
)
tokens = {key:(val.cuda() if next(self.parameters()).is_cuda else val) for key,val in tokens.items()}
clss_embedding = self.encode_text(tokens)
clss_embedding = clss_embedding.mean(dim=0)
clss_embedding /= clss_embedding.norm()
clss_embeddings.append(clss_embedding)
imnet_text_embeddings = torch.stack(clss_embeddings, dim=0)
return imnet_text_embeddings
def get_text_embeddings(self, texts):
templates = DEFAULT_TEMPLATES[:1]
clss_embeddings = []
for clss in texts:
txts = [template.format(clss) for template in templates]
tokens = self.tokenizer(
txts, padding='max_length', truncation=True, max_length=77, return_tensors='pt'
)
tokens = {key:(val.cuda() if next(self.parameters()).is_cuda else val) for key,val in tokens.items()}
clss_embedding = self.encode_text(tokens)
clss_embedding = clss_embedding.mean(dim=0)
clss_embedding /= clss_embedding.norm()
clss_embeddings.append(clss_embedding)
imnet_text_embeddings = torch.stack(clss_embeddings, dim=0)
return imnet_text_embeddings
def encode_image(self, image, norm=True, output_map=False):
x = self.image_encoder.forward_features(image, output_map=output_map)
if output_map:
x, x_map, H, W = x
x = x @ self.image_projection
if output_map:
x_map = self.image_projection.unsqueeze(0).transpose(1, 2) @ x_map
if norm:
x = x / x.norm(dim=-1, keepdim=True)
if output_map:
x_map = x_map / x_map.norm(dim=1, keepdim=True)
if output_map:
return x, x_map, H, W
else:
return x
def encode_text(self, text, norm=True):
x = self.text_encoder(**text)
x = x['last_hidden_state']
if self.conf_lang_encoder['TOKENIZER'] == 'clip':
x = x[torch.arange(x.size(0)), text['input_ids'].argmax(dim=-1)]
else:
x = x[:, 0]
x = x @ self.text_projection
if norm:
x = x / x.norm(dim=-1, keepdim=True)
return x
def forward(self, image, text):
features_image = self.encode_image(image)
features_text = self.encode_text(text)
# cosine similarity as logits
T = self.logit_scale.exp()
return features_image, features_text, T
def build_unicl_model(config, **kwargs):
model = UniCLModel(config)
if config['MODEL']['PRETRAINED'] != '':
pretrained_path = config['MODEL']['PRETRAINED']
from ..Utils.Utils import is_valid_url, download_file
if is_valid_url(pretrained_path):
with tempfile.TemporaryDirectory() as tmp_path:
file_local_path = pathlib.Path(tmp_path) / 'base_model.pt'
download_file(pretrained_path, file_local_path)
model.from_pretrained(str(file_local_path), config['MODEL']['PRETRAINED_LAYERS'], config['VERBOSE'])
else:
model.from_pretrained(pretrained_path, config['MODEL']['PRETRAINED_LAYERS'], config['VERBOSE'])
return model