liangfeng
add ovseg
583456e
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
from collections import OrderedDict
import torch
import torch.nn as nn
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, dilation=1):
super().__init__()
# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(
planes, planes, 3, padding=1 * dilation, bias=False, dilation=dilation
)
self.bn2 = nn.BatchNorm2d(planes)
self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = None
self.stride = stride
if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict(
[
("-1", nn.AvgPool2d(stride)),
(
"0",
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False,
),
),
("1", nn.BatchNorm2d(planes * self.expansion)),
]
)
)
def forward(self, x: torch.Tensor):
identity = x
out = self.relu(self.bn1(self.conv1(x)))
out = self.relu(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ModifiedResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""
def __init__(self, layers, width=64, strides=[2, 1, 2, 2, 2], multi_grid=[1, 1, 1]):
super().__init__()
# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False
)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(strides[0]) if strides[0] > 1 else nn.Identity()
self.relu = nn.ReLU(inplace=True)
# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0], stride=strides[1])
self.layer2 = self._make_layer(width * 2, layers[1], stride=strides[2])
self.layer3 = self._make_layer(width * 4, layers[2], stride=strides[3])
self.layer4 = self._make_layer(
width * 8, layers[3], stride=strides[4], dilations=multi_grid
)
self.num_features = [width * 4, width * 8, width * 16, width * 32]
def _make_layer(self, planes, blocks, stride=1, dilations=None):
if dilations is None:
dilations = [1] * blocks
layers = [Bottleneck(self._inplanes, planes, stride, dilation=dilations[0])]
self._inplanes = planes * Bottleneck.expansion
for i in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes, dilation=dilations[i]))
return nn.Sequential(*layers)
def forward(self, x):
def stem(x):
for conv, bn in [
(self.conv1, self.bn1),
(self.conv2, self.bn2),
(self.conv3, self.bn3),
]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x
output = {}
x = x.type(self.conv1.weight.dtype)
x = stem(x) # 1/4,1/4
x = self.layer1(x)
output["res2"] = x
x = self.layer2(x) # 1/8,1/8
output["res3"] = x
x = self.layer3(x) # 1/16,1/16
output["res4"] = x
x = self.layer4(x) # 1/32,1/32
output["res5"] = x
return output
@BACKBONE_REGISTRY.register()
class D2ModifiedResNet(ModifiedResNet, Backbone):
def __init__(self, cfg, input_shape):
depth = cfg.MODEL.RESNETS.DEPTH
num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
bottleneck_channels = num_groups * width_per_group
num_blocks_per_stage = {
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3],
}[depth]
strides = [2, 1, 2, 2, 2]
multi_grid = cfg.MODEL.RESNETS.RES5_MULTI_GRID
if cfg.MODEL.RESNETS.STEM_TYPE == "deeplab":
strides = [1, 1, 2, 2, 2]
super().__init__(
num_blocks_per_stage,
bottleneck_channels,
strides=strides,
multi_grid=multi_grid,
)
self._out_features = cfg.MODEL.RESNETS.OUT_FEATURES
self._out_feature_strides = {
"res2": 4,
"res3": 8,
"res4": 16,
"res5": 32,
}
self._out_feature_channels = {
"res2": self.num_features[0],
"res3": self.num_features[1],
"res4": self.num_features[2],
"res5": self.num_features[3],
}
def forward(self, x):
"""
Args:
x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
Returns:
dict[str->Tensor]: names and the corresponding features
"""
outputs = {}
y = super().forward(x)
for k in y.keys():
if k in self._out_features:
outputs[k] = y[k]
return outputs
def output_shape(self):
return {
name: ShapeSpec(
channels=self._out_feature_channels[name],
stride=self._out_feature_strides[name],
)
for name in self._out_features
}
@property
def size_divisibility(self):
return 32