---
license: apache-2.0
---
Same as https://huggingface.co/HuggingFaceM4/siglip-so400m-14-384-flash-attn2 with two changes:
- increase the max resolution to 980 x 980 (instead of 384 x 384) by interpolating the position embeddings (see the sketch below)
- implement the strategy from [NaViT](https://arxiv.org/abs/2307.06304) to allow a/ variable-resolution images, b/ aspect-ratio-preserved images
These changes only apply to the vision tower. No changes to the text tower.
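For intuition, here is a minimal sketch of how a square grid of learned position embeddings can be interpolated to a larger grid; the helper name `interpolate_pos_embed` is illustrative, and the actual implementation lives in `modeling_siglip.py`:
```python
import torch
import torch.nn.functional as F

def interpolate_pos_embed(pos_embed: torch.Tensor, old_grid: int, new_grid: int) -> torch.Tensor:
    """Resize learned position embeddings from (old_grid**2, dim) to (new_grid**2, dim)."""
    dim = pos_embed.shape[-1]
    # Lay the embeddings out as a 2D grid so they can be resized like an image
    grid = pos_embed.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2)
    # Bicubic resize, e.g. 384 // 14 = 27 -> 980 // 14 = 70 patches per side
    grid = F.interpolate(grid, size=(new_grid, new_grid), mode="bicubic", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(new_grid * new_grid, dim)
```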
The implementation is fully backward compatible with `https://huggingface.co/HuggingFaceM4/siglip-so400m-14-384-flash-attn2` -> just don't specify the `patch_attention_mask` (see the example after the snippet below).
Usage:
```python
import torch

from modeling_siglip import SiglipVisionModel  # custom modeling file shipped in this repo

DEVICE = torch.device("cuda:0")
PATCH_SIZE = 14

# Batch of 2 RGB images padded onto a shared 28 x 42 pixel canvas (a 2 x 3 grid of 14 x 14 patches)
pixel_values = torch.randn(2, 3, 28, 42, dtype=torch.bfloat16, device=DEVICE)
# Per-pixel validity mask (1 = real pixel, 0 = padding), one 28 x 42 grid per image
pixel_attention_mask = [
    # Image 1: top 14 pixel rows are valid, bottom 14 rows are padding
    [[1] * 42] * 14 + [[0] * 42] * 14,
    # Image 2: left 28 pixel columns are valid, right 14 columns are padding
    [[1] * 28 + [0] * 14] * 28,
]
pixel_attention_mask = torch.tensor(pixel_attention_mask, dtype=torch.bool, device=DEVICE)

# Pool the pixel mask into a patch mask: keep a patch iff it contains at least one real pixel
patches_subgrid = pixel_attention_mask.unfold(
    dimension=1, size=PATCH_SIZE, step=PATCH_SIZE
).unfold(dimension=2, size=PATCH_SIZE, step=PATCH_SIZE)
patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()  # shape (2, 2, 3)
model = SiglipVisionModel.from_pretrained(
    "HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit", _flash_attn_2_enabled=True
)
model.train()
model.vision_model.to(DEVICE, dtype=torch.bfloat16)

output = model.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
```
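For the backward-compatible path, just omit `patch_attention_mask`; every patch is then treated as valid. A minimal sketch reusing `model` and `DEVICE` from the snippet above:
```python
# Fully-valid fixed-resolution batch: no mask needed
full_pixel_values = torch.randn(2, 3, 28, 42, dtype=torch.bfloat16, device=DEVICE)
output = model.vision_model(pixel_values=full_pixel_values)
```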