Upload EVPRefer_warp

- model.py +2 -2
- model.safetensors +2 -2
model.py CHANGED

@@ -286,7 +286,7 @@ class EVPRefer(nn.Module):
 
         self.classifier = SimpleDecoding(dims=neck_dim)
 
-        self.
+        self.alpha = nn.Parameter(torch.ones(token_embed_dim) * 1e-4)
 
         self.aggregation = InverseMultiAttentiveFeatureRefinement([320,680,1320,1280])
         self.clip_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
@@ -311,7 +311,7 @@ class EVPRefer(nn.Module):
         latents = latents / 4.7164
 
         l_feats = self.clip_model(input_ids=input_ids).last_hidden_state
-        c_crossattn = self.text_adapter(latents, l_feats, self.
+        c_crossattn = self.text_adapter(latents, l_feats, self.alpha)  # NOTE: here the c_crossattn should be expand_dim as latents
         t = torch.ones((img.shape[0],), device=img.device).long()
         outs = self.unet(latents, t, c_crossattn=[c_crossattn])
 
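For context, here is a minimal sketch of how the new self.alpha parameter is typically consumed by the text_adapter call above. The TextAdapter class itself is not part of this commit, so the class name, its internals, and the shapes below are assumptions rather than the repository's actual implementation; the sketch only illustrates the near-zero-initialized residual scaling and the batch-dimension expansion that the NOTE in the diff refers to.

import torch
import torch.nn as nn

class TextAdapterSketch(nn.Module):
    # Hypothetical stand-in for self.text_adapter; the real class is not shown
    # in this commit, so treat names and layer sizes as assumptions.
    def __init__(self, text_dim=768):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(text_dim, text_dim),
            nn.GELU(),
            nn.Linear(text_dim, text_dim),
        )

    def forward(self, latents, texts, alpha):
        # alpha is the per-channel nn.Parameter initialized to 1e-4, so the
        # refined branch starts close to the raw CLIP features.
        texts = texts + alpha * self.fc(texts)
        # Expand c_crossattn along the batch dimension so it matches latents,
        # as the NOTE in the diff suggests.
        if texts.shape[0] != latents.shape[0]:
            texts = texts.expand(latents.shape[0], -1, -1)
        return texts

# Illustrative shapes only:
adapter = TextAdapterSketch(text_dim=768)
latents = torch.randn(4, 4, 64, 64)              # diffusion latents for a batch of 4
l_feats = torch.randn(1, 77, 768)                # CLIP last_hidden_state
alpha = nn.Parameter(torch.ones(768) * 1e-4)
c_crossattn = adapter(latents, l_feats, alpha)   # -> (4, 77, 768)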
model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:debca855a8042c58d2f5b6f22660682119d50b7684ccedacfd5953a07bb9852a
+size 4317953152