Spaces: Build error
Commit c7f0cc1 · committed by Liangyu
Parent(s): 1e03b30
add functions
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- OpenPSG/checkpoints/epoch_60.pth +3 -0
- OpenPSG/configs/_base_/custom_runtime.py +17 -0
- OpenPSG/configs/_base_/datasets/psg.py +93 -0
- OpenPSG/configs/_base_/datasets/psg_panoptic.py +72 -0
- OpenPSG/configs/_base_/datasets/vg_detection.py +56 -0
- OpenPSG/configs/_base_/datasets/vg_sg.py +57 -0
- OpenPSG/configs/_base_/models/detr4seg_r101.py +64 -0
- OpenPSG/configs/_base_/models/detr4seg_r101_psg.py +137 -0
- OpenPSG/configs/_base_/models/detr4seg_r50.py +65 -0
- OpenPSG/configs/_base_/models/detr4seg_r50_psg.py +152 -0
- OpenPSG/configs/_base_/models/detr_r50.py +64 -0
- OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py +107 -0
- OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py +8 -0
- OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py +74 -0
- OpenPSG/configs/_base_/models/psgtr_r101.py +5 -0
- OpenPSG/configs/_base_/models/psgtr_r50.py +82 -0
- OpenPSG/configs/_base_/schedules/schedule_1x.py +10 -0
- OpenPSG/configs/_base_/schedules/schedule_3x.py +10 -0
- OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py +26 -0
- OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py +26 -0
- OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py +41 -0
- OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py +45 -0
- OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py +28 -0
- OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py +26 -0
- OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py +44 -0
- OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py +48 -0
- OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py +28 -0
- OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py +28 -0
- OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py +241 -0
- OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py +44 -0
- OpenPSG/configs/psgformer/psgformer_r101_psg.py +16 -0
- OpenPSG/configs/psgformer/psgformer_r50.py +96 -0
- OpenPSG/configs/psgformer/psgformer_r50_psg.py +244 -0
- OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py +31 -0
- OpenPSG/configs/psgtr/psgtr_r101_psg.py +231 -0
- OpenPSG/configs/psgtr/psgtr_r50.py +82 -0
- OpenPSG/configs/psgtr/psgtr_r50_psg.py +233 -0
- OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py +31 -0
- OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py +28 -0
- OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py +28 -0
- OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py +43 -0
- OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py +49 -0
- README.md +3 -3
- app.py +133 -13
- fake_gan.py +56 -0
- images/cooking.jpg +0 -0
- images/forrest-gump.jpg +0 -0
- images/friends.jpg +0 -0
- images/mbappe.jpg +0 -0
- images/messi.jpg +0 -0
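The configs under OpenPSG/configs follow the mmdetection-style `_base_` composition: each experiment file inherits a model file, a dataset file, a schedule, and the shared custom_runtime.py, then overrides a few fields. A minimal sketch of loading and inspecting one of them (this assumes mmcv is importable and the repository root is the working directory; the chosen config is one whose whole `_base_` chain appears in this commit):

from mmcv import Config

# `_base_` files are merged recursively into one dict-like config object.
cfg = Config.fromfile('OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py')

print(cfg.model.type)            # 'PanopticFPN', declared in this file
print(cfg.data.samples_per_gpu)  # 8, overriding the dataset base file
print(cfg.optimizer.lr)          # 0.02, from the schedule/override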
OpenPSG/checkpoints/epoch_60.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1c4ddcbda74686568b7e6b8145f7f33030407e27e390c37c23206f95c51829ed
size 531751994
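The checkpoint above is committed as a Git LFS pointer (version, oid, size), not the ~531 MB weights themselves. A small sketch of checking whether the real file has been pulled, using only the Python standard library (the path comes from this commit; the helper name is illustrative):

import os

def read_lfs_pointer(path):
    # Parse the "key value" fields of a Git LFS pointer file.
    fields = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            key, _, value = line.strip().partition(' ')
            if key:
                fields[key] = value
    return fields

ckpt = 'OpenPSG/checkpoints/epoch_60.pth'
# A pointer file is only ~130 bytes; the real checkpoint is `size` bytes (531751994 here).
if os.path.getsize(ckpt) < 1024:
    ptr = read_lfs_pointer(ckpt)
    print('LFS pointer only; run `git lfs pull`. Expected size:', ptr['size'])
else:
    print('Checkpoint appears to be the real weights.')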
OpenPSG/configs/_base_/custom_runtime.py
ADDED
@@ -0,0 +1,17 @@
checkpoint_config = dict(interval=1, max_keep_ckpts=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
custom_hooks = [dict(type='NumClassCheckHook')]

dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None

workflow = [('train', 1), ('val', 1)]
OpenPSG/configs/_base_/datasets/psg.py
ADDED
@@ -0,0 +1,93 @@
# dataset settings
dataset_type = 'PanopticSceneGraphDataset'
ann_file = './data/psg/psg.json'
coco_root = 'data/coco'

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadPanopticSceneGraphAnnotations',
        with_bbox=True,
        with_rel=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='SegRescale', scale_factor=1 / 4),
    dict(type='SceneGraphFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_rels',
            'gt_relmaps',
            'gt_masks',
            'gt_semantic_seg',
        ],
    ),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # Since the forward process may need gt info, annos must be loaded.
    dict(type='LoadPanopticSceneGraphAnnotations',
        with_bbox=True,
        with_rel=True),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            # NOTE: Do not change the img to DC.
            dict(type='ImageToTensor', keys=['img']),
            dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
            dict(
                type='ToDataContainer',
                fields=(dict(key='gt_bboxes'), dict(key='gt_labels')),
            ),
            dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
        ],
    ),
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=ann_file,
        img_prefix=coco_root,
        seg_prefix=coco_root,
        pipeline=train_pipeline,
        split='train',
        all_bboxes=True,
    ),
    val=dict(
        type=dataset_type,
        ann_file=ann_file,
        img_prefix=coco_root,
        seg_prefix=coco_root,
        pipeline=test_pipeline,
        split='test',
        all_bboxes=True,
    ),
    test=dict(
        type=dataset_type,
        ann_file=ann_file,
        img_prefix=coco_root,
        seg_prefix=coco_root,
        pipeline=test_pipeline,
        split='test',
        all_bboxes=True,
    ),
)
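A sketch of how the `data` block above is typically consumed on the mmdetection 2.x side (this assumes the openpsg package is importable so that 'PanopticSceneGraphDataset' and the custom pipeline transforms are registered, and that the data paths hard-coded in the config exist):

from mmcv import Config
from mmdet.datasets import build_dataset

import openpsg.datasets  # noqa: F401  # registers the custom dataset/pipelines (assumption)

cfg = Config.fromfile('OpenPSG/configs/_base_/datasets/psg.py')
# Builds the training split and attaches train_pipeline to ./data/psg/psg.json images.
train_set = build_dataset(cfg.data.train)
print(len(train_set))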
OpenPSG/configs/_base_/datasets/psg_panoptic.py
ADDED
@@ -0,0 +1,72 @@
# dataset settings
dataset_type = 'PanopticSceneGraphDataset'
ann_file = './data/psg/psg.json'
coco_root = './data/coco'

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadPanopticSceneGraphAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='SegRescale', scale_factor=1 / 4),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'],
    ),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=ann_file,
        img_prefix=coco_root,
        seg_prefix=coco_root,
        pipeline=train_pipeline,
        split='train',
    ),
    val=dict(
        type=dataset_type,
        ann_file=ann_file,
        img_prefix=coco_root,
        seg_prefix=coco_root,
        pipeline=test_pipeline,
        split='test',
    ),
    test=dict(
        type=dataset_type,
        ann_file=ann_file,
        img_prefix=coco_root,
        seg_prefix=coco_root,
        pipeline=test_pipeline,
        split='test',
    ),
)
evaluation = dict(interval=1, metric='PQ')
OpenPSG/configs/_base_/datasets/vg_detection.py
ADDED
@@ -0,0 +1,56 @@
# dataset settings
custom_imports = dict(imports=[
    'openpsg.datasets',
    'openpsg.datasets.pipelines',
],
    allow_failed_imports=False)

dataset_type = 'SceneGraphDataset'
ann_file = 'data/vg/data_openpsg.json'
img_dir = 'data/vg/VG_100K'

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadSceneGraphAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(type=dataset_type,
        ann_file=ann_file,
        img_prefix=img_dir,
        pipeline=train_pipeline,
        split='train'),
    val=dict(type=dataset_type,
        ann_file=ann_file,
        img_prefix=img_dir,
        pipeline=test_pipeline,
        split='test'),
    test=dict(type=dataset_type,
        ann_file=ann_file,
        img_prefix=img_dir,
        pipeline=test_pipeline,
        split='test'))
evaluation = dict(interval=1, metric='bbox')
OpenPSG/configs/_base_/datasets/vg_sg.py
ADDED
@@ -0,0 +1,57 @@
# dataset settings
dataset_type = 'SceneGraphDataset'
ann_file = '/mnt/ssd/gzj/data/VisualGenome/data_openpsg.json'
img_dir = '/mnt/ssd/gzj/data/VisualGenome/VG_100K'

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='SceneGraphFormatBundle'),
    dict(type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_relmaps']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # Since the forward process may need gt info, annos must be loaded.
    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            # NOTE: Do not change the img to DC.
            dict(type='ImageToTensor', keys=['img']),
            dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
            dict(type='ToDataContainer',
                fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
            dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
        ])
]
data = dict(samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(type=dataset_type,
        ann_file=ann_file,
        img_prefix=img_dir,
        pipeline=train_pipeline,
        split='train'),
    val=dict(type=dataset_type,
        ann_file=ann_file,
        img_prefix=img_dir,
        pipeline=test_pipeline,
        split='test'),
    test=dict(type=dataset_type,
        ann_file=ann_file,
        img_prefix=img_dir,
        pipeline=test_pipeline,
        split='test'))
OpenPSG/configs/_base_/models/detr4seg_r101.py
ADDED
@@ -0,0 +1,64 @@
model = dict(
    type='DETR4seg',
    backbone=dict(type='ResNet',
        depth=101,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained',
            checkpoint='torchvision://resnet101')),
    bbox_head=dict(type='detr4segHead',
        num_classes=80,
        in_channels=2048,
        transformer=dict(
            type='Transformer',
            encoder=dict(type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1)
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                        embed_dims=256,
                        num_heads=8,
                        dropout=0.1),
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')),
            )),
        positional_encoding=dict(type='SinePositionalEncoding',
            num_feats=128,
            normalize=True),
        loss_cls=dict(type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0,
            class_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0),
        dice_loss=dict(type='DiceLoss', loss_weight=1.0)),
    # training and testing settings
    train_cfg=dict(assigner=dict(
        type='HungarianAssigner',
        cls_cost=dict(type='ClassificationCost', weight=1.),
        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=100))
OpenPSG/configs/_base_/models/detr4seg_r101_psg.py
ADDED
@@ -0,0 +1,137 @@
_base_ = [
    '../_base_/models/detr4seg_r101.py', '../_base_/datasets/psg.py',
    '../_base_/custom_runtime.py'
]

custom_imports = dict(imports=[
    'openpsg.models.frameworks.detr4seg',
    'openpsg.models.relation_heads.detr4seg_head', 'openpsg.datasets',
    'openpsg.datasets.pipelines.loading',
    'openpsg.datasets.pipelines.rel_randomcrop',
    'openpsg.models.relation_heads.approaches.matcher',
    'openpsg.models.losses.seg_losses'
],
    allow_failed_imports=False)

object_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
    'food-other-merged', 'building-other-merged', 'rock-merged',
    'wall-other-merged', 'rug-merged'
]

model = dict(bbox_head=dict(
    num_classes=len(object_classes),
    object_classes=object_classes,
))

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(type='Resize',
                    img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=False),  # no empty relations
                dict(type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=1),
    dict(type='RelsFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# test_pipeline, NOTE the Pad's size_divisor is different from the default
# setting (size_divisor=32). While there is little effect on the performance
# whether we use the default setting or use size_divisor=1.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=1),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=0.0001,
    weight_decay=0.0001,
    paramwise_cfg=dict(
        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))

# learning policy
lr_config = dict(policy='step', step=110)
runner = dict(type='EpochBasedRunner', max_epochs=150)

project_name = 'detr4seg'
expt_name = 'detr4seg_r101_coco'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[dict(type='TextLoggerHook'),
           dict(type='TensorboardLoggerHook')],
)

load_from = '/mnt/ssd/gzj/test/OpenPSG/detr_r50_fb_origin.pth'
OpenPSG/configs/_base_/models/detr4seg_r50.py
ADDED
@@ -0,0 +1,65 @@
model = dict(
    type='DETR4seg',
    backbone=dict(type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained',
            checkpoint='torchvision://resnet50')),
    bbox_head=dict(type='detr4segHead',
        num_classes=80,
        in_channels=2048,
        transformer=dict(
            type='Transformer',
            encoder=dict(type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1)
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                        embed_dims=256,
                        num_heads=8,
                        dropout=0.1),
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')),
            )),
        positional_encoding=dict(type='SinePositionalEncoding',
            num_feats=128,
            normalize=True),
        loss_cls=dict(type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0,
            class_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0),
        focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
        dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0)),
    # training and testing settings
    train_cfg=dict(assigner=dict(
        type='HungarianAssigner',
        cls_cost=dict(type='ClassificationCost', weight=1.),
        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=100))
OpenPSG/configs/_base_/models/detr4seg_r50_psg.py
ADDED
@@ -0,0 +1,152 @@
_base_ = ['./detr4seg_r50.py', '../datasets/psg.py', '../custom_runtime.py']

custom_imports = dict(imports=[
    'openpsg.models.frameworks.detr4seg',
    'openpsg.models.relation_heads.detr4seg_head', 'openpsg.datasets',
    'openpsg.datasets.pipelines.loading',
    'openpsg.datasets.pipelines.rel_randomcrop',
    'openpsg.models.relation_heads.approaches.matcher',
    'openpsg.models.losses.seg_losses'
],
    allow_failed_imports=False)

object_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
    'food-other-merged', 'building-other-merged', 'rock-merged',
    'wall-other-merged', 'rug-merged'
]

model = dict(bbox_head=dict(
    num_classes=len(object_classes),
    object_classes=object_classes,
))

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadPanopticSceneGraphAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(type='Resize',
                    img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=False),  # no empty relations
                dict(type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=1),
    dict(type='RelsFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
]
# test_pipeline, NOTE the Pad's size_divisor is different from the default
# setting (size_divisor=32). While there is little effect on the performance
# whether we use the default setting or use size_divisor=1.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=1),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(samples_per_gpu=1,
    workers_per_gpu=1,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(type='AdamW',
    lr=0.00001,
    weight_decay=0.0001,
    paramwise_cfg=dict(
        custom_keys={
            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
            'bbox_attention': dict(lr_mult=10.0, decay_mult=1.0),
            'mask_head': dict(lr_mult=10.0, decay_mult=1.0)
        }))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))

# learning policy
lr_config = dict(policy='step', step=8)
runner = dict(type='EpochBasedRunner', max_epochs=10)

evaluation = dict(interval=1, metric='PQ')
checkpoint_config = dict(interval=1, max_keep_ckpts=10)

project_name = 'detr4seg'
expt_name = 'test_detr4seg_r50_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ))
    ],
)

load_from = 'detr_pan_r50.pth'
OpenPSG/configs/_base_/models/detr_r50.py
ADDED
@@ -0,0 +1,64 @@
model = dict(
    type='DETR',
    backbone=dict(type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3, ),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained',
            checkpoint='torchvision://resnet50')),
    bbox_head=dict(type='DETRHead',
        num_classes=80,
        in_channels=2048,
        transformer=dict(
            type='Transformer',
            encoder=dict(type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1)
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                        embed_dims=256,
                        num_heads=8,
                        dropout=0.1),
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')),
            )),
        positional_encoding=dict(type='SinePositionalEncoding',
            num_feats=128,
            normalize=True),
        loss_cls=dict(type='CrossEntropyLoss',
            bg_cls_weight=0.1,
            use_sigmoid=False,
            loss_weight=1.0,
            class_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(assigner=dict(
        type='HungarianAssigner',
        cls_cost=dict(type='ClassificationCost', weight=1.),
        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=100))
OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py
ADDED
@@ -0,0 +1,107 @@
# model settings
model = dict(
    type='MaskRCNN',
    backbone=dict(type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained',
            checkpoint='torchvision://resnet50')),
    neck=dict(type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(type='CrossEntropyLoss',
            use_sigmoid=True,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(type='StandardRoIHead',
        bbox_roi_extractor=dict(type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign',
                output_size=7,
                sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(type='CrossEntropyLoss',
                use_sigmoid=False,
                loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign',
                output_size=14,
                sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(type='CrossEntropyLoss',
                use_mask=True,
                loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(rpn=dict(assigner=dict(type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(assigner=dict(type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(rpn=dict(nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py
ADDED
@@ -0,0 +1,8 @@
_base_ = './panoptic_fpn_r50_fpn_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

expt_name = 'panoptic_fpn_r101_fpn_psg'
load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py
ADDED
@@ -0,0 +1,74 @@
_base_ = [
    '../models/mask_rcnn_r50_fpn.py',
    '../datasets/psg_panoptic.py',
    '../schedules/schedule_1x.py',
    '../custom_runtime.py',
]

model = dict(
    type='PanopticFPN',
    semantic_head=dict(
        type='PanopticFPNHead',
        num_things_classes=80,
        num_stuff_classes=53,
        in_channels=256,
        inner_channels=128,
        start_level=0,
        end_level=4,
        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
        conv_cfg=None,
        loss_seg=dict(type='CrossEntropyLoss',
            ignore_index=255,
            loss_weight=0.5),
    ),
    panoptic_fusion_head=dict(type='HeuristicFusionHead',
        num_things_classes=80,
        num_stuff_classes=53),
    test_cfg=dict(panoptic=dict(
        score_thr=0.6,
        max_per_img=100,
        mask_thr_binary=0.5,
        mask_overlap=0.5,
        nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True),
        stuff_area_limit=4096,
    )),
)

custom_hooks = []

# Change batch size and learning rate
data = dict(samples_per_gpu=8,
    # workers_per_gpu=2
)
# optimizer = dict(lr=0.02)
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(_delete_=True,
    grad_clip=dict(max_norm=35, norm_type=2))

lr_config = dict(policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    step=[8, 11])

project_name = 'openpsg'
expt_name = 'panoptic_fpn_r50_fpn_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth'
OpenPSG/configs/_base_/models/psgtr_r101.py
ADDED
@@ -0,0 +1,5 @@
_base_ = './psgtr_r50.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
OpenPSG/configs/_base_/models/psgtr_r50.py
ADDED
@@ -0,0 +1,82 @@
model = dict(
    type='PSGTr',
    backbone=dict(type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained',
            checkpoint='torchvision://resnet50')),
    bbox_head=dict(type='PSGTrHead',
        num_classes=80,
        num_relations=117,
        in_channels=2048,
        transformer=dict(
            type='Transformer',
            encoder=dict(type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1)
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                        embed_dims=256,
                        num_heads=8,
                        dropout=0.1),
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')),
            )),
        positional_encoding=dict(type='SinePositionalEncoding',
            num_feats=128,
            normalize=True),
        sub_loss_cls=dict(type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0,
            class_weight=1.0),
        sub_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        sub_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
        sub_focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
        sub_dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0),
        obj_loss_cls=dict(type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0,
            class_weight=1.0),
        obj_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        obj_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
        obj_focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
        obj_dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0),
        rel_loss_cls=dict(type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=2.0,
            class_weight=1.0)),
    # training and testing settings
    train_cfg=dict(assigner=dict(
        type='HTriMatcher',
        s_cls_cost=dict(type='ClassificationCost', weight=1.),
        s_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
        s_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
        o_cls_cost=dict(type='ClassificationCost', weight=1.),
        o_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
        o_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
        r_cls_cost=dict(type='ClassificationCost', weight=2.))),
    test_cfg=dict(max_per_img=100))
OpenPSG/configs/_base_/schedules/schedule_1x.py
ADDED
@@ -0,0 +1,10 @@
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8, 11])
runner = dict(type='EpochBasedRunner', max_epochs=12)
OpenPSG/configs/_base_/schedules/schedule_3x.py
ADDED
@@ -0,0 +1,10 @@
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[27, 33])
runner = dict(type='EpochBasedRunner', max_epochs=36)
OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py
ADDED
@@ -0,0 +1,26 @@
_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# Log config
project_name = 'openpsg'
expt_name = 'gpsnet_panoptic_fpn_r101_fpn_1x_predcls_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
ADDED
@@ -0,0 +1,26 @@
_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# Log config
project_name = 'openpsg'
expt_name = 'gpsnet_panoptic_fpn_r101_fpn_1x_sgdet_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py
ADDED
@@ -0,0 +1,41 @@
_base_ = [
    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
]

model = dict(relation_head=dict(
    type='GPSHead',
    head_config=dict(
        # NOTE: Evaluation type
        use_gt_box=True,
        use_gt_label=True,
    ),
))

evaluation = dict(interval=1,
    metric='predcls',
    relation_mode=True,
    classwise=True,
    detection_method='pan_seg')

# Change batch size and learning rate
data = dict(samples_per_gpu=16, workers_per_gpu=0)
optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001)

# Log config
project_name = 'openpsg'
expt_name = 'gpsnet_panoptic_fpn_r50_fpn_1x_predcls_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        ),
    ],
)
OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
ADDED
@@ -0,0 +1,45 @@
_base_ = [
    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
]

model = dict(
    relation_head=dict(
        type='GPSHead',
        head_config=dict(
            # NOTE: Evaluation type
            use_gt_box=False,
            use_gt_label=False,
        ),
    ),
    roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),
)

evaluation = dict(
    interval=1,
    metric='sgdet',
    relation_mode=True,
    classwise=True,
    iou_thrs=0.5,
    detection_method='pan_seg',
)

data = dict(samples_per_gpu=16)

# Log config
project_name = 'openpsg'
expt_name = 'gpsnet_panoptic_fpn_r50_fpn_1x_sgdet_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        ),
    ],
)
OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py
ADDED
@@ -0,0 +1,28 @@
_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# Log config
project_name = 'openpsg'
expt_name = 'imp_panoptic_fpn_r101_fpn_1x_predcls_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
ADDED
@@ -0,0 +1,26 @@
_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# Log config
project_name = 'openpsg'
expt_name = 'imp_panoptic_fpn_r101_fpn_1x_sgdet_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py
ADDED
@@ -0,0 +1,44 @@
_base_ = [
    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
]

model = dict(relation_head=dict(
    type='IMPHead',
    head_config=dict(
        # NOTE: Evaluation type
        use_gt_box=True,
        use_gt_label=True,
        num_iter=2,
    ),
))

evaluation = dict(interval=1,
    metric='predcls',
    relation_mode=True,
    classwise=True)

# Change batch size and learning rate
data = dict(samples_per_gpu=16, )
# workers_per_gpu=0) # FIXME: Is this the problem?
optimizer = dict(type='SGD', lr=0.001, momentum=0.9)

# Log config
project_name = 'openpsg'
expt_name = 'imp_panoptic_fpn_r50_fpn_1x_predcls_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)
OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
ADDED
@@ -0,0 +1,48 @@
_base_ = [
    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
]

model = dict(relation_head=dict(
    type='IMPHead',
    head_config=dict(
        # NOTE: Evaluation type
        use_gt_box=False,
        use_gt_label=False,
        num_iter=2,
    ),
))

evaluation = dict(
    interval=1,
    metric='sgdet',
    relation_mode=True,
    classwise=True,
    iou_thrs=0.5,
    detection_method='pan_seg',
)

# Change batch size and learning rate
data = dict(samples_per_gpu=16, )
# workers_per_gpu=0) # FIXME: Is this the problem?
optimizer = dict(type='SGD', lr=0.001, momentum=0.9)

# Log config
project_name = 'openpsg'
expt_name = 'imp_panoptic_fpn_r50_fpn_1x_sgdet_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)
OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py
ADDED
@@ -0,0 +1,28 @@
_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# Log config
project_name = 'openpsg'
expt_name = 'motifs_panoptic_fpn_r101_fpn_1x_predcls_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'

OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
ADDED
@@ -0,0 +1,28 @@
_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# Log config
project_name = 'openpsg'
expt_name = 'motifs_panoptic_fpn_r101_fpn_1x_sgdet_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'

OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py
ADDED
@@ -0,0 +1,241 @@
_base_ = [
    '../_base_/models/mask_rcnn_r50_fpn.py',
    '../_base_/datasets/psg.py',
    '../_base_/schedules/schedule_1x.py',
    '../_base_/custom_runtime.py',
]

find_unused_parameters = True
dataset_type = 'PanopticSceneGraphDataset'

# HACK:
object_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
    'food-other-merged', 'building-other-merged', 'rock-merged',
    'wall-other-merged', 'rug-merged'
]

predicate_classes = [
    'over', 'in front of', 'beside', 'on', 'in', 'attached to',
    'hanging from', 'on back of', 'falling off', 'going down', 'painted on',
    'walking on', 'running on', 'crossing', 'standing on', 'lying on',
    'sitting on', 'flying over', 'jumping over', 'jumping from', 'wearing',
    'holding', 'carrying', 'looking at', 'guiding', 'kissing', 'eating',
    'drinking', 'feeding', 'biting', 'catching', 'picking', 'playing with',
    'chasing', 'climbing', 'cleaning', 'playing', 'touching', 'pushing',
    'pulling', 'opening', 'cooking', 'talking to', 'throwing', 'slicing',
    'driving', 'riding', 'parked on', 'driving on', 'about to hit',
    'kicking', 'swinging', 'entering', 'exiting', 'enclosing', 'leaning on',
]

model = dict(
    type='SceneGraphPanopticFPN',
    semantic_head=dict(
        type='PanopticFPNHead',
        num_things_classes=80,
        num_stuff_classes=53,
        in_channels=256,
        inner_channels=128,
        start_level=0,
        end_level=4,
        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
        conv_cfg=None,
        loss_seg=dict(type='CrossEntropyLoss',
                      ignore_index=255,
                      loss_weight=0.5),
    ),
    panoptic_fusion_head=dict(type='HeuristicFusionHead',
                              num_things_classes=80,
                              num_stuff_classes=53),
    test_cfg=dict(panoptic=dict(
        score_thr=0.6,
        max_per_img=100,
        mask_thr_binary=0.5,
        mask_overlap=0.5,
        nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True),
        stuff_area_limit=4096,
    )),
    relation_head=dict(
        type='MotifHead',
        object_classes=object_classes,
        predicate_classes=predicate_classes,
        num_classes=len(object_classes) + 1,  # with background class
        num_predicates=len(predicate_classes) + 1,
        use_bias=False,  # NOTE: whether to use frequency bias
        head_config=dict(
            # NOTE: Evaluation type
            use_gt_box=True,
            use_gt_label=True,
            use_vision=True,
            embed_dim=200,
            hidden_dim=512,
            roi_dim=1024,
            context_pooling_dim=4096,
            dropout_rate=0.2,
            context_object_layer=1,
            context_edge_layer=1,
            glove_dir='data/glove/',
            causal_effect_analysis=False,
        ),
        bbox_roi_extractor=dict(
            type='VisualSpatialExtractor',
            bbox_roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
            with_visual_bbox=True,
            with_visual_mask=False,
            with_visual_point=False,
            with_spatial=False,
            in_channels=256,
            fc_out_channels=1024,
            featmap_strides=[4, 8, 16, 32],
        ),
        relation_roi_extractor=dict(
            type='VisualSpatialExtractor',
            bbox_roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
            with_visual_bbox=True,
            with_visual_mask=False,
            with_visual_point=False,
            with_spatial=True,
            separate_spatial=False,
            in_channels=256,
            fc_out_channels=1024,
            featmap_strides=[4, 8, 16, 32],
        ),
        relation_sampler=dict(
            type='Motif',
            pos_iou_thr=0.5,
            require_overlap=False,  # for sgdet training, not require
            num_sample_per_gt_rel=4,
            num_rel_per_image=1024,
            pos_fraction=0.25,
            # NOTE: To only include overlapping bboxes?
            test_overlap=False,  # for testing
        ),
        loss_object=dict(type='CrossEntropyLoss',
                         use_sigmoid=False,
                         loss_weight=1.0),
        loss_relation=dict(type='CrossEntropyLoss',
                           use_sigmoid=False,
                           loss_weight=1.0),
    ),
)

custom_hooks = []

# To freeze modules
freeze_modules = [
    'backbone',
    'neck',
    'rpn_head',
    'roi_head',
    'semantic_head',
    'panoptic_fusion_head',
]

evaluation = dict(interval=1,
                  metric='predcls',
                  relation_mode=True,
                  classwise=True)

# Change batch size and learning rate
data = dict(samples_per_gpu=16, )
# optimizer = dict(lr=0.003)
optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(_delete_=True,
                        grad_clip=dict(max_norm=35, norm_type=2))

lr_config = dict(policy='step',
                 warmup='linear',
                 warmup_iters=500,
                 warmup_ratio=1.0 / 3,
                 step=[7, 10])

# Log config
project_name = 'openpsg'
expt_name = 'motifs_panoptic_fpn_r50_fpn_1x_predcls_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth'

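The motifs config above is the base that the IMP and VCTree variants in this commit inherit from through `_base_`. A minimal sketch of how that composition can be inspected with mmcv (the config path is an assumption about where the repo is checked out; the commented values follow from the files shown here):

from mmcv import Config

# mmcv merges every file listed in `_base_` first, then applies the child's
# overrides, so e.g. the IMP config only has to swap the relation_head.
cfg = Config.fromfile(
    'OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py')
print(cfg.model.relation_head.type)  # 'IMPHead' (overrides 'MotifHead' from this base)
print(cfg.data.samples_per_gpu)      # 16
print(cfg.evaluation.metric)         # 'predcls'
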
OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
ADDED
@@ -0,0 +1,44 @@
_base_ = [
    './panoptic_fpn_r50_fpn_1x_predcls_psg.py',
]

model = dict(
    relation_head=dict(
        head_config=dict(
            # NOTE: Evaluation type
            use_gt_box=False,
            use_gt_label=False,
        ), ),
    roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),
)

evaluation = dict(interval=1,
                  metric='sgdet',
                  relation_mode=True,
                  classwise=True,
                  iou_thrs=0.5,
                  detection_method='pan_seg')

# Change batch size and learning rate
data = dict(samples_per_gpu=8,
            # workers_per_gpu=2
            )

# Log config
project_name = 'openpsg'
expt_name = 'motifs_panoptic_fpn_r50_fpn_1x_sgdet_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        ),
    ],
)

OpenPSG/configs/psgformer/psgformer_r101_psg.py
ADDED
@@ -0,0 +1,16 @@
_base_ = './psgformer_r50_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# learning policy
lr_config = dict(policy='step', step=48)
runner = dict(type='EpochBasedRunner', max_epochs=60)

project_name = 'psgformer'
expt_name = 'psgformer_r101_psg'
work_dir = f'./work_dirs/{expt_name}'
checkpoint_config = dict(interval=12, max_keep_ckpts=10)

load_from = './work_dirs/checkpoints/detr4psgformer_r101.pth'

OpenPSG/configs/psgformer/psgformer_r50.py
ADDED
@@ -0,0 +1,96 @@
model = dict(
    type='PSGTr',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    bbox_head=dict(
        type='PSGFormerHead',
        num_classes=80,
        num_relations=117,
        in_channels=2048,
        transformer=dict(
            type='DualTransformer',
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(type='MultiheadAttention',
                             embed_dims=256,
                             num_heads=8,
                             dropout=0.1)
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder1=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                                   embed_dims=256,
                                   num_heads=8,
                                   dropout=0.1),
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm'))),
            decoder2=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                                   embed_dims=256,
                                   num_heads=8,
                                   dropout=0.1),
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm'))),
        ),
        positional_encoding=dict(type='SinePositionalEncoding',
                                 num_feats=128,
                                 normalize=True),
        rel_loss_cls=dict(type='CrossEntropyLoss',
                          use_sigmoid=False,
                          loss_weight=2.0,
                          class_weight=1.0),
        sub_id_loss=dict(type='MultilabelCrossEntropy', loss_weight=2.0),
        obj_id_loss=dict(type='MultilabelCrossEntropy', loss_weight=2.0),
        loss_cls=dict(type='CrossEntropyLoss',
                      use_sigmoid=False,
                      loss_weight=4.0,
                      class_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=3.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0),
        focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
        dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0)),
    # training and testing settings
    train_cfg=dict(
        id_assigner=dict(
            type='IdMatcher',
            sub_id_cost=dict(type='ClassificationCost', weight=1.),
            obj_id_cost=dict(type='ClassificationCost', weight=1.),
            r_cls_cost=dict(type='ClassificationCost', weight=1.)),
        bbox_assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='ClassificationCost', weight=4.0),
            reg_cost=dict(type='BBoxL1Cost', weight=3.0),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=100))

OpenPSG/configs/psgformer/psgformer_r50_psg.py
ADDED
@@ -0,0 +1,244 @@
_base_ = [
    './psgformer_r50.py', '../_base_/datasets/psg.py',
    '../_base_/custom_runtime.py'
]

find_unused_parameters = True

custom_imports = dict(
    imports=[
        'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
        'openpsg.models.frameworks.dual_transformer',
        'openpsg.models.relation_heads.psgformer_head', 'openpsg.datasets',
        'openpsg.datasets.pipelines.loading',
        'openpsg.datasets.pipelines.rel_randomcrop',
        'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
    ],
    allow_failed_imports=False)

dataset_type = 'PanopticSceneGraphDataset'

# HACK:
object_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
    'food-other-merged', 'building-other-merged', 'rock-merged',
    'wall-other-merged', 'rug-merged'
]

predicate_classes = [
    'over', 'in front of', 'beside', 'on', 'in', 'attached to',
    'hanging from', 'on back of', 'falling off', 'going down', 'painted on',
    'walking on', 'running on', 'crossing', 'standing on', 'lying on',
    'sitting on', 'flying over', 'jumping over', 'jumping from', 'wearing',
    'holding', 'carrying', 'looking at', 'guiding', 'kissing', 'eating',
    'drinking', 'feeding', 'biting', 'catching', 'picking', 'playing with',
    'chasing', 'climbing', 'cleaning', 'playing', 'touching', 'pushing',
    'pulling', 'opening', 'cooking', 'talking to', 'throwing', 'slicing',
    'driving', 'riding', 'parked on', 'driving on', 'about to hit',
    'kicking', 'swinging', 'entering', 'exiting', 'enclosing', 'leaning on',
]

model = dict(bbox_head=dict(
    num_classes=len(object_classes),
    num_relations=len(predicate_classes),
    object_classes=object_classes,
    predicate_classes=predicate_classes,
    num_obj_query=100,
    num_rel_query=100,
), )

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                    std=[58.395, 57.12, 57.375],
                    to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadPanopticSceneGraphAnnotations',
         with_bbox=True,
         with_rel=True,
         with_mask=True,
         with_seg=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(type='Resize',
                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                (576, 1333), (608, 1333), (640, 1333),
                                (672, 1333), (704, 1333), (736, 1333),
                                (768, 1333), (800, 1333)],
                     multiscale_mode='value',
                     keep_ratio=True)
            ],
            [
                dict(type='Resize',
                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                     multiscale_mode='value',
                     keep_ratio=True),
                dict(type='RelRandomCrop',
                     crop_type='absolute_range',
                     crop_size=(384, 600),
                     allow_negative_crop=False),  # no empty relations
                dict(type='Resize',
                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                (576, 1333), (608, 1333), (640, 1333),
                                (672, 1333), (704, 1333), (736, 1333),
                                (768, 1333), (800, 1333)],
                     multiscale_mode='value',
                     override=True,
                     keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=1),
    dict(type='RelsFormatBundle'),
    dict(type='Collect',
         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
]
# test_pipeline, NOTE the Pad's size_divisor is different from the default
# setting (size_divisor=32). While there is little effect on the performance
# whether we use the default setting or use size_divisor=1.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
    dict(type='MultiScaleFlipAug',
         img_scale=(1333, 800),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),
             dict(type='RandomFlip'),
             dict(type='Normalize', **img_norm_cfg),
             dict(type='Pad', size_divisor=1),
             dict(type='ImageToTensor', keys=['img']),
             dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
             dict(type='ToDataContainer',
                  fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
             dict(type='Collect', keys=['img']),
         ])
]

evaluation = dict(
    interval=1,
    metric='sgdet',
    relation_mode=True,
    classwise=True,
    iou_thrs=0.5,
    detection_method='pan_seg',
)

data = dict(samples_per_gpu=1,
            workers_per_gpu=2,
            train=dict(pipeline=train_pipeline),
            val=dict(pipeline=test_pipeline),
            test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=0.001,
    weight_decay=0.0001,
    paramwise_cfg=dict(
        custom_keys={
            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
            'transformer.encoder': dict(lr_mult=0.1, decay_mult=1.0),
            'transformer.decoder1': dict(lr_mult=0.1, decay_mult=1.0),
            'obj_query_embed': dict(lr_mult=0.1, decay_mult=1.0),
            'input_proj': dict(lr_mult=0.1, decay_mult=1.0),
            'class_embed': dict(lr_mult=0.1, decay_mult=1.0),
            'box_embed': dict(lr_mult=0.1, decay_mult=1.0),
            'bbox_attention': dict(lr_mult=0.1, decay_mult=1.0),
            'mask_head': dict(lr_mult=0.1, decay_mult=1.0),
        }))

optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))

# learning policy
lr_config = dict(policy='step', step=40)
runner = dict(type='EpochBasedRunner', max_epochs=60)

project_name = 'psgformer'
expt_name = 'psgformer_r50_psg'
work_dir = f'./work_dirs/{expt_name}'
checkpoint_config = dict(interval=1, max_keep_ckpts=15)

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        )
    ],
)

load_from = './work_dirs/checkpoints/detr4psgformer_r50.pth'

OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py
ADDED
@@ -0,0 +1,31 @@
_base_ = [
    './psgformer_r50_psg.py'
]

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                    std=[58.395, 57.12, 57.375],
                    to_rgb=True)
pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            # NOTE: Do not change the img to DC.
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

data = dict(
    test=dict(
        pipeline=pipeline,
    ),
)

OpenPSG/configs/psgtr/psgtr_r101_psg.py
ADDED
@@ -0,0 +1,231 @@
_base_ = [
    '../_base_/models/psgtr_r101.py', '../_base_/datasets/psg.py',
    '../_base_/custom_runtime.py'
]

custom_imports = dict(
    imports=[
        'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
        'openpsg.models.relation_heads.psgtr_head', 'openpsg.datasets',
        'openpsg.datasets.pipelines.loading',
        'openpsg.datasets.pipelines.rel_randomcrop',
        'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
    ],
    allow_failed_imports=False)

dataset_type = 'PanopticSceneGraphDataset'

# HACK:
object_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
    'food-other-merged', 'building-other-merged', 'rock-merged',
    'wall-other-merged', 'rug-merged'
]

predicate_classes = [
    'over', 'in front of', 'beside', 'on', 'in', 'attached to',
    'hanging from', 'on back of', 'falling off', 'going down', 'painted on',
    'walking on', 'running on', 'crossing', 'standing on', 'lying on',
    'sitting on', 'flying over', 'jumping over', 'jumping from', 'wearing',
    'holding', 'carrying', 'looking at', 'guiding', 'kissing', 'eating',
    'drinking', 'feeding', 'biting', 'catching', 'picking', 'playing with',
    'chasing', 'climbing', 'cleaning', 'playing', 'touching', 'pushing',
    'pulling', 'opening', 'cooking', 'talking to', 'throwing', 'slicing',
    'driving', 'riding', 'parked on', 'driving on', 'about to hit',
    'kicking', 'swinging', 'entering', 'exiting', 'enclosing', 'leaning on',
]

model = dict(bbox_head=dict(
    num_classes=len(object_classes),
    num_relations=len(predicate_classes),
    object_classes=object_classes,
    predicate_classes=predicate_classes,
    use_mask=True,
    num_query=100,
), )

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                    std=[58.395, 57.12, 57.375],
                    to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadPanopticSceneGraphAnnotations',
         with_bbox=True,
         with_rel=True,
         with_mask=True,
         with_seg=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(type='Resize',
                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                (576, 1333), (608, 1333), (640, 1333),
                                (672, 1333), (704, 1333), (736, 1333),
                                (768, 1333), (800, 1333)],
                     multiscale_mode='value',
                     keep_ratio=True)
            ],
            [
                dict(type='Resize',
                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                     multiscale_mode='value',
                     keep_ratio=True),
                dict(type='RelRandomCrop',
                     crop_type='absolute_range',
                     crop_size=(384, 600),
                     allow_negative_crop=False),  # no empty relations
                dict(type='Resize',
                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                (576, 1333), (608, 1333), (640, 1333),
                                (672, 1333), (704, 1333), (736, 1333),
                                (768, 1333), (800, 1333)],
                     multiscale_mode='value',
                     override=True,
                     keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=1),
    dict(type='RelsFormatBundle'),
    dict(type='Collect',
         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
]
# test_pipeline, NOTE the Pad's size_divisor is different from the default
# setting (size_divisor=32). While there is little effect on the performance
# whether we use the default setting or use size_divisor=1.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=1),
            dict(type='ImageToTensor', keys=['img']),
            # dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
            # dict(type='ToDataContainer', fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
            dict(type='Collect', keys=['img']),
        ])
]

evaluation = dict(
    interval=1,
    metric='sgdet',
    relation_mode=True,
    classwise=True,
    iou_thrs=0.5,
    detection_method='pan_seg',
)

data = dict(samples_per_gpu=1,
            workers_per_gpu=2,
            train=dict(pipeline=train_pipeline),
            val=dict(pipeline=test_pipeline),
            test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=0.0001,
    weight_decay=0.0001,
    paramwise_cfg=dict(custom_keys={
        'backbone': dict(lr_mult=0.1, decay_mult=1.0),
    }))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))

# learning policy
lr_config = dict(policy='step', step=40)
runner = dict(type='EpochBasedRunner', max_epochs=60)

project_name = 'psgtr'
expt_name = 'psgtr_r101_psg'
work_dir = f'./work_dirs/{expt_name}'
checkpoint_config = dict(interval=2, max_keep_ckpts=10)

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
            ),
        )
    ],
)

load_from = 'work_dirs/checkpoints/detr_pan_r101.pth'

OpenPSG/configs/psgtr/psgtr_r50.py
ADDED
@@ -0,0 +1,82 @@
model = dict(
    type='PSGTr',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    bbox_head=dict(
        type='PSGTrHead',
        num_classes=80,
        num_relations=117,
        in_channels=2048,
        transformer=dict(
            type='Transformer',
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(type='MultiheadAttention',
                             embed_dims=256,
                             num_heads=8,
                             dropout=0.1)
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                                   embed_dims=256,
                                   num_heads=8,
                                   dropout=0.1),
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')),
            )),
        positional_encoding=dict(type='SinePositionalEncoding',
                                 num_feats=128,
                                 normalize=True),
        sub_loss_cls=dict(type='CrossEntropyLoss',
                          use_sigmoid=False,
                          loss_weight=1.0,
                          class_weight=1.0),
        sub_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        sub_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
        sub_focal_loss=dict(type='BCEFocalLoss', loss_weight=2.0),
        sub_dice_loss=dict(type='psgtrDiceLoss', loss_weight=2.0),
        obj_loss_cls=dict(type='CrossEntropyLoss',
                          use_sigmoid=False,
                          loss_weight=1.0,
                          class_weight=1.0),
        obj_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        obj_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
        obj_focal_loss=dict(type='BCEFocalLoss', loss_weight=2.0),
        obj_dice_loss=dict(type='psgtrDiceLoss', loss_weight=2.0),
        rel_loss_cls=dict(type='CrossEntropyLoss',
                          use_sigmoid=False,
                          loss_weight=2.0,
                          class_weight=1.0)),
    # training and testing settings
    train_cfg=dict(assigner=dict(
        type='HTriMatcher',
        s_cls_cost=dict(type='ClassificationCost', weight=1.),
        s_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
        s_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
        o_cls_cost=dict(type='ClassificationCost', weight=1.),
        o_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
        o_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
        r_cls_cost=dict(type='ClassificationCost', weight=2.))),
    test_cfg=dict(max_per_img=100))

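For intuition, the `HTriMatcher` and `HungarianAssigner` entries in `train_cfg` above combine weighted classification, L1 box and IoU costs into one matrix and solve a one-to-one matching between predicted queries and ground truth. A toy sketch of that idea in plain NumPy/SciPy (made-up cost values, not the OpenPSG implementation; the 1/5/2 weights mirror the subject costs in the config above):

import numpy as np
from scipy.optimize import linear_sum_assignment

# Toy costs for 3 predicted queries vs 2 ground-truth triplets.
cls_cost = np.array([[0.2, 0.9], [0.8, 0.1], [0.5, 0.6]])
reg_cost = np.array([[0.3, 0.7], [0.6, 0.2], [0.4, 0.5]])
iou_cost = np.array([[0.1, 0.8], [0.7, 0.2], [0.3, 0.4]])

# Weighted sum, then Hungarian matching: each ground truth gets one query.
total_cost = 1.0 * cls_cost + 5.0 * reg_cost + 2.0 * iou_cost
query_idx, gt_idx = linear_sum_assignment(total_cost)
print(list(zip(query_idx.tolist(), gt_idx.tolist())))
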
OpenPSG/configs/psgtr/psgtr_r50_psg.py
ADDED
@@ -0,0 +1,233 @@
_base_ = [
    '../_base_/models/psgtr_r50.py', '../_base_/datasets/psg.py',
    '../_base_/custom_runtime.py'
]

custom_imports = dict(
    imports=[
        'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
        'openpsg.models.relation_heads.psgtr_head', 'openpsg.datasets',
        'openpsg.datasets.pipelines.loading',
        'openpsg.datasets.pipelines.rel_randomcrop',
        'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
    ],
    allow_failed_imports=False)

dataset_type = 'PanopticSceneGraphDataset'

# HACK:
object_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
    'food-other-merged', 'building-other-merged', 'rock-merged',
    'wall-other-merged', 'rug-merged'
]

predicate_classes = [
    'over', 'in front of', 'beside', 'on', 'in', 'attached to',
    'hanging from', 'on back of', 'falling off', 'going down', 'painted on',
    'walking on', 'running on', 'crossing', 'standing on', 'lying on',
    'sitting on', 'flying over', 'jumping over', 'jumping from', 'wearing',
    'holding', 'carrying', 'looking at', 'guiding', 'kissing', 'eating',
    'drinking', 'feeding', 'biting', 'catching', 'picking', 'playing with',
    'chasing', 'climbing', 'cleaning', 'playing', 'touching', 'pushing',
    'pulling', 'opening', 'cooking', 'talking to', 'throwing', 'slicing',
    'driving', 'riding', 'parked on', 'driving on', 'about to hit',
    'kicking', 'swinging', 'entering', 'exiting', 'enclosing', 'leaning on',
]

model = dict(bbox_head=dict(
    num_classes=len(object_classes),
    num_relations=len(predicate_classes),
    object_classes=object_classes,
    predicate_classes=predicate_classes,
    use_mask=True,
    num_query=100,
), )

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                    std=[58.395, 57.12, 57.375],
                    to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadPanopticSceneGraphAnnotations',
         with_bbox=True,
         with_rel=True,
         with_mask=True,
         with_seg=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(type='Resize',
                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                (576, 1333), (608, 1333), (640, 1333),
                                (672, 1333), (704, 1333), (736, 1333),
                                (768, 1333), (800, 1333)],
                     multiscale_mode='value',
                     keep_ratio=True)
            ],
            [
                dict(type='Resize',
                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                     multiscale_mode='value',
                     keep_ratio=True),
                dict(type='RelRandomCrop',
                     crop_type='absolute_range',
                     crop_size=(384, 600),
                     allow_negative_crop=False),  # no empty relations
                dict(type='Resize',
                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                (576, 1333), (608, 1333), (640, 1333),
                                (672, 1333), (704, 1333), (736, 1333),
                                (768, 1333), (800, 1333)],
                     multiscale_mode='value',
                     override=True,
                     keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=1),
    dict(type='RelsFormatBundle'),
    dict(type='Collect',
         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
]
# test_pipeline, NOTE the Pad's size_divisor is different from the default
# setting (size_divisor=32). While there is little effect on the performance
# whether we use the default setting or use size_divisor=1.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=1),
            dict(type='ImageToTensor', keys=['img']),
            # dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
            # dict(type='ToDataContainer', fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
            dict(type='Collect', keys=['img']),
        ])
]

evaluation = dict(
    interval=1,
    metric='sgdet',
    relation_mode=True,
    classwise=True,
    iou_thrs=0.5,
    detection_method='pan_seg',
)

data = dict(samples_per_gpu=1,
            workers_per_gpu=2,
            train=dict(pipeline=train_pipeline),
            val=dict(pipeline=test_pipeline),
            test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=0.0001,
    weight_decay=0.0001,
    paramwise_cfg=dict(custom_keys={
        'backbone': dict(lr_mult=0.1, decay_mult=1.0),
    }))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))

# learning policy
lr_config = dict(policy='step', step=40)
runner = dict(type='EpochBasedRunner', max_epochs=60)

project_name = 'psgformer'
expt_name = 'psgtr_r50_psg_0.5_scale_mask'
work_dir = f'./work_dirs/{expt_name}'
checkpoint_config = dict(interval=2, max_keep_ckpts=10)

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        )
    ],
)

load_from = 'work_dirs/checkpoints/detr_pan_r50.pth'

OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py
ADDED
@@ -0,0 +1,31 @@
_base_ = [
    './psgtr_r50_psg.py'
]

img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                    std=[58.395, 57.12, 57.375],
                    to_rgb=True)
pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            # NOTE: Do not change the img to DC.
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

data = dict(
    test=dict(
        pipeline=pipeline,
    ),
)

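Inference configs like the one above are what a demo or script feeds to the standard MMDetection entry points. A minimal sketch (the checkpoint path, device and image path are assumptions based on the files added in this commit, and it assumes the OpenPSG modules named in custom_imports are importable; in app.py further below the raw result is rendered with the repo's show_result helper from utils):

from mmdet.apis import init_detector, inference_detector

# Assumed paths: the checkpoint shipped under OpenPSG/checkpoints and one of
# the demo images added in this commit.
model = init_detector('OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py',
                      'OpenPSG/checkpoints/epoch_60.pth',
                      device='cpu')
result = inference_detector(model, 'images/cooking.jpg')
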
OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py
ADDED
@@ -0,0 +1,28 @@
_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# Log config
project_name = 'openpsg'
expt_name = 'vctree_panoptic_fpn_r101_fpn_1x_predcls_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'

OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
ADDED
@@ -0,0 +1,28 @@
_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'

model = dict(backbone=dict(
    depth=101,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))

# Log config
project_name = 'openpsg'
expt_name = 'vctree_panoptic_fpn_r101_fpn_1x_sgdet_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)

load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'

OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py
ADDED
@@ -0,0 +1,43 @@
_base_ = [
    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
]

model = dict(relation_head=dict(
    type='VCTreeHead',
    head_config=dict(
        # NOTE: Evaluation type
        use_gt_box=True,
        use_gt_label=True,
    ),
))

evaluation = dict(interval=1,
                  metric='predcls',
                  relation_mode=True,
                  classwise=True)

# Change batch size and learning rate
data = dict(samples_per_gpu=16,
            workers_per_gpu=0)  # FIXME: Is this the problem?
# optimizer = dict(lr=0.001)

# Log config
project_name = 'openpsg'
expt_name = 'vctree_panoptic_fpn_r50_fpn_1x_predcls_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)

OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
ADDED
@@ -0,0 +1,49 @@
_base_ = [
    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
]

model = dict(
    relation_head=dict(
        type='VCTreeHead',
        head_config=dict(
            # NOTE: Evaluation type
            use_gt_box=False,
            use_gt_label=False,
        ),
    ),
    roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),
)

evaluation = dict(interval=1,
                  metric='sgdet',
                  relation_mode=True,
                  classwise=True,
                  iou_thrs=0.5,
                  detection_method='pan_seg')

# Change batch size and learning rate
data = dict(samples_per_gpu=16,
            # workers_per_gpu=2
            )
# optimizer = dict(lr=0.003)

# Log config
project_name = 'openpsg'
expt_name = 'vctree_panoptic_fpn_r50_fpn_1x_sgdet_psg'
work_dir = f'./work_dirs/{expt_name}'

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=project_name,
                name=expt_name,
                # config=work_dir + "/cfg.yaml"
            ),
        ),
    ],
)
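Taken together, the two R-50 variants differ only in the evaluation protocol: `predcls` feeds ground-truth boxes and labels to the relation head, while `sgdet` predicts everything and is scored against the panoptic segmentation. A quick check against the merged configs, a sketch under the same assumptions as the snippet above:

from mmcv import Config

for name in ('predcls', 'sgdet'):
    cfg = Config.fromfile(
        f'OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_{name}_psg.py')
    head = cfg.model.relation_head.head_config
    print(name,
          'use_gt_box:', head.use_gt_box,
          'use_gt_label:', head.use_gt_label,
          'metric:', cfg.evaluation.metric)
# Expected output: predcls -> True / True / 'predcls',
#                  sgdet   -> False / False / 'sgdet'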
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
 title: OpenPSG
-emoji:
-colorFrom:
-colorTo:
+emoji: πΌοΈποΈππ
+colorFrom: yellow
+colorTo: blue
 sdk: gradio
 sdk_version: 3.1.4
 app_file: app.py
app.py
CHANGED
@@ -1,15 +1,135 @@
-import numpy as np
-import gradio as gr
-
-def sepia(input_img):
-    sepia_filter = np.array([
-        [0.393, 0.769, 0.189],
-        [0.349, 0.686, 0.168],
-        [0.272, 0.534, 0.131]
-    ])
-    sepia_img = input_img.dot(sepia_filter.T)
-    sepia_img /= sepia_img.max()
-    return sepia_img
-
-demo = gr.Interface(sepia, gr.Image(shape=(200, 200)), "image")
-demo.launch(share=True)
+#!/usr/bin/env python
+
+from __future__ import annotations
+
+import argparse
+import os
+import pathlib
+import subprocess
+import tarfile
+
+if os.getenv('SYSTEM') == 'spaces':
+    import mim
+
+    mim.uninstall('mmcv-full', confirm_yes=True)
+    mim.install('mmcv-full==1.5.2', is_yes=True)
+
+    subprocess.call('pip uninstall -y opencv-python'.split())
+    subprocess.call('pip uninstall -y opencv-python-headless'.split())
+    subprocess.call('pip install opencv-python-headless==4.5.5.64'.split())
+
+import cv2
+import gradio as gr
+import numpy as np
+
+from mmdet.apis import init_detector, inference_detector
+from utils import show_result
+import mmcv
+from mmcv import Config
+import os.path as osp
+
+DESCRIPTION = '''# OpenPSG
+
+This is an official demo for [OpenPSG](https://github.com/Jingkang50/OpenPSG).
+<img id="overview" alt="overview" src="https://camo.githubusercontent.com/880346b66831a8212074787ba9a2301b4d700bd8f765ca11e4845ac0ab34c230/68747470733a2f2f6c6976652e737461746963666c69636b722e636f6d2f36353533352f35323139333837393637375f373531613465306237395f6b2e6a7067" />
+'''
+FOOTER = '<img id="visitor-badge" src="https://visitor-badge.glitch.me/badge?page_id=c-liangyu.openpsg" alt="visitor badge" />'
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--device', type=str, default='cpu')
+    parser.add_argument('--theme', type=str)
+    parser.add_argument('--share', action='store_true')
+    parser.add_argument('--port', type=int)
+    parser.add_argument('--disable-queue',
+                        dest='enable_queue',
+                        action='store_false')
+    return parser.parse_args()
+
+
+def update_input_image(image: np.ndarray) -> dict:
+    if image is None:
+        return gr.Image.update(value=None)
+    scale = 1500 / max(image.shape[:2])
+    if scale < 1:
+        image = cv2.resize(image, None, fx=scale, fy=scale)
+    return gr.Image.update(value=image)
+
+
+def set_example_image(example: list) -> dict:
+    return gr.Image.update(value=example[0])
+
+
+def infer(model, input_image, num_rel):
+    result = inference_detector(model, input_image)
+    return show_result(input_image,
+                       result,
+                       is_one_stage=True,
+                       num_rel=num_rel,
+                       show=True
+                       )
+
+
+def main():
+    args = parse_args()
+
+    model_ckt = 'OpenPSG/checkpoints/epoch_60.pth'
+    cfg = Config.fromfile('OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py')
+
+    model = init_detector(cfg, model_ckt, device=args.device)
+
+    with gr.Blocks(theme=args.theme, css='style.css') as demo:
+        gr.Markdown(DESCRIPTION)
+
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    input_image = gr.Image(label='Input Image', type='numpy')
+                with gr.Group():
+                    with gr.Row():
+                        num_rel = gr.Slider(
+                            5,
+                            100,
+                            step=5,
+                            value=20,
+                            label='Number of Relations')
+                    with gr.Row():
+                        run_button = gr.Button(value='Run')
+                        # prediction_results = gr.Variable()
+            with gr.Column():
+                with gr.Row():
+                    # visualization = gr.Image(label='Result', type='numpy')
+                    result = gr.Gallery(label='Result', type='numpy')
+
+        with gr.Row():
+            paths = sorted(pathlib.Path('images').rglob('*.jpg'))
+            example_images = gr.Dataset(components=[input_image],
+                                        samples=[[path.as_posix()]
+                                                 for path in paths])
+
+        gr.Markdown(FOOTER)
+
+        input_image.change(fn=update_input_image,
+                           inputs=input_image,
+                           outputs=input_image)
+
+        # NOTE: `model` is not a Gradio component, so it is bound via a
+        # closure here; the slider value is passed through `inputs` instead.
+        run_button.click(fn=lambda image, n: infer(model, image, n),
+                         inputs=[input_image, num_rel],
+                         outputs=result)
+
+        example_images.click(fn=set_example_image,
+                             inputs=example_images,
+                             outputs=input_image)
+
+    demo.launch(
+        enable_queue=args.enable_queue,
+        server_port=args.port,
+        share=args.share,
+    )
+
+
+if __name__ == '__main__':
+    main()
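For completeness, the inference path behind the Run button can also be exercised outside Gradio. A minimal sketch, assuming the checkpoint and config shipped with this commit plus one of the example images are available locally, and that the repo-local `utils.show_result` helper (the same one app.py imports) is on the Python path:

from mmcv import Config
from mmdet.apis import inference_detector, init_detector

from utils import show_result  # repo-local visualisation helper

cfg = Config.fromfile('OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py')
model = init_detector(cfg, 'OpenPSG/checkpoints/epoch_60.pth', device='cpu')

result = inference_detector(model, 'images/cooking.jpg')
# Mirror the call made in infer(): visualise the top-20 predicted relations.
outputs = show_result('images/cooking.jpg',
                      result,
                      is_one_stage=True,
                      num_rel=20,
                      show=True)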
fake_gan.py
ADDED
@@ -0,0 +1,56 @@
# another demo
# https://huggingface.co/spaces/dalle-mini/dalle-mini/blob/21944e2a8508568387951fc66a30e90f1d58819d/app/gradio/app.py

# This demo needs to be run from the repo folder.
# python demo/fake_gan/run.py
import os
import random
import time

import gradio as gr


def fake_gan(count, *args):
    time.sleep(1)
    images = [
        random.choice(
            [
                "https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=387&q=80",
                "https://images.unsplash.com/photo-1554151228-14d9def656e4?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=386&q=80",
                "https://images.unsplash.com/photo-1542909168-82c3e7fdca5c?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxzZWFyY2h8MXx8aHVtYW4lMjBmYWNlfGVufDB8fDB8fA%3D%3D&w=1000&q=80",
                "https://images.unsplash.com/photo-1546456073-92b9f0a8d413?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=387&q=80",
                "https://images.unsplash.com/photo-1601412436009-d964bd02edbc?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=464&q=80",
            ]
        )
        for _ in range(int(count))
    ]
    return images


cheetah = os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg")

demo = gr.Interface(
    fn=fake_gan,
    inputs=[
        gr.Number(label="Generation Count"),
        gr.Image(label="Initial Image (optional)"),
        gr.Slider(0, 50, 25, label="TV_scale (for smoothness)"),
        gr.Slider(0, 50, 25, label="Range_Scale (out of range RBG)"),
        gr.Number(label="Seed"),
        gr.Number(label="Respacing"),
    ],
    outputs=gr.Gallery(label="Generated Images"),
    title="FD-GAN",
    description="This is a fake demo of a GAN. In reality, the images are randomly chosen from Unsplash.",
    examples=[
        [2, cheetah, 12, None, None, None],
        [1, cheetah, 2, None, None, None],
        [4, cheetah, 42, None, None, None],
        [5, cheetah, 23, None, None, None],
        [4, cheetah, 11, None, None, None],
        [3, cheetah, 1, None, None, None],
    ],
)

if __name__ == "__main__":
    demo.launch()
images/cooking.jpg
ADDED
images/forrest-gump.jpg
ADDED
images/friends.jpg
ADDED
images/mbappe.jpg
ADDED
images/messi.jpg
ADDED