_base_ = [
    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
]

# model settings
model = dict(
    backbone=dict(
        type='PointNet2SASSG',
        in_channels=3,
        num_points=(2048, 1024, 512, 256),
        radius=(0.2, 0.4, 0.8, 1.2),
        num_samples=(64, 32, 16, 16),
        sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
                     (256, 256, 512)),
        fp_channels=((512, 512), (512, 288)),
        norm_cfg=dict(type='BN2d'),
        sa_cfg=dict(
            type='PointSAModule',
            pool_mod='max',
            use_xyz=True,
            normalize_xyz=True)),
    bbox_head=dict(
        num_classes=18,
        num_decoder_layers=12,
        num_proposal=512,
        size_cls_agnostic=False,
        bbox_coder=dict(
            type='GroupFree3DBBoxCoder',
            num_sizes=18,
            num_dir_bins=1,
            with_rot=False,
            size_cls_agnostic=False,
            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
                        [1.876858, 1.8425595, 1.1931566],
                        [0.61328, 0.6148609, 0.7182701],
                        [1.3955007, 1.5121545, 0.83443564],
                        [0.97949594, 1.0675149, 0.6329687],
                        [0.531663, 0.5955577, 1.7500148],
                        [0.9624706, 0.72462326, 1.1481868],
                        [0.83221924, 1.0490936, 1.6875663],
                        [0.21132214, 0.4206159, 0.5372846],
                        [1.4440073, 1.8970833, 0.26985747],
                        [1.0294262, 1.4040797, 0.87554324],
                        [1.3766412, 0.65521795, 1.6813129],
                        [0.6650819, 0.71111923, 1.298853],
                        [0.41999173, 0.37906948, 1.7513971],
                        [0.59359556, 0.5912492, 0.73919016],
                        [0.50867593, 0.50656086, 0.30136237],
                        [1.1511526, 1.0546296, 0.49706793],
                        [0.47535285, 0.49249494, 0.5802117]]),
        sampling_objectness_loss=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=8.0),
        objectness_loss=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        center_loss=dict(
            type='mmdet.SmoothL1Loss',
            beta=0.04,
            reduction='sum',
            loss_weight=10.0),
        dir_class_loss=dict(
            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
        dir_res_loss=dict(
            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
        size_class_loss=dict(
            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
        size_res_loss=dict(
            type='mmdet.SmoothL1Loss',
            beta=1.0 / 9.0,
            reduction='sum',
            loss_weight=10.0 / 9.0),
        semantic_loss=dict(
            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
    test_cfg=dict(
        sample_mode='kps',
        nms_thr=0.25,
        score_thr=0.0,
        per_class_proposal=True,
        prediction_stages='last_three'))
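# Note: `num_classes`, `num_sizes` and the 18 rows of `mean_sizes` above all
# refer to the same 18 ScanNet categories. `mean_sizes` holds the per-class
# mean box dimensions (in meters), listed in the order of `class_names`
# defined below, and serves as the size prior for `GroupFree3DBBoxCoder`
# since `size_cls_agnostic=False`.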
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
               'garbagebin')
metainfo = dict(classes=class_names)
backend_args = None

train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='DEPTH',
        load_dim=6,
        use_dim=[0, 1, 2],
        backend_args=backend_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_mask_3d=True,
        with_seg_3d=True,
        backend_args=backend_args),
    dict(type='GlobalAlignment', rotation_axis=2),
    dict(type='PointSegClassMapping'),
    dict(type='PointSample', num_points=50000),
    dict(
        type='RandomFlip3D',
        sync_2d=False,
        flip_ratio_bev_horizontal=0.5,
        flip_ratio_bev_vertical=0.5),
    dict(
        type='GlobalRotScaleTrans',
        # +/- 0.087266 rad is roughly +/- 5 degrees
        rot_range=[-0.087266, 0.087266],
        scale_ratio_range=[1.0, 1.0]),
    dict(
        type='Pack3DDetInputs',
        keys=[
            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
            'pts_instance_mask'
        ])
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='DEPTH',
        load_dim=6,
        use_dim=[0, 1, 2],
        backend_args=backend_args),
    dict(type='GlobalAlignment', rotation_axis=2),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0]),
            dict(
                type='RandomFlip3D',
                sync_2d=False,
                flip_ratio_bev_horizontal=0.5,
                flip_ratio_bev_vertical=0.5),
            dict(type='PointSample', num_points=50000),
        ]),
    dict(type='Pack3DDetInputs', keys=['points'])
]

train_dataloader = dict(
    batch_size=8,
    num_workers=4,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        # RepeatDataset concatenates the training set 5 times, so each
        # "epoch" below iterates over the data 5 times.
        type='RepeatDataset',
        times=5,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            ann_file='scannet_infos_train.pkl',
            pipeline=train_pipeline,
            filter_empty_gt=False,
            metainfo=metainfo,
            # box_type_3d='LiDAR' is used for the KITTI and nuScenes datasets;
            # box_type_3d='Depth' is used for the SUN RGB-D and ScanNet
            # datasets.
            box_type_3d='Depth',
            backend_args=backend_args)))
val_dataloader = dict(
    batch_size=1,
    num_workers=1,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='scannet_infos_val.pkl',
        pipeline=test_pipeline,
        metainfo=metainfo,
        test_mode=True,
        box_type_3d='Depth',
        backend_args=backend_args))
test_dataloader = dict(
    batch_size=1,
    num_workers=1,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='scannet_infos_val.pkl',
        pipeline=test_pipeline,
        metainfo=metainfo,
        test_mode=True,
        box_type_3d='Depth',
        backend_args=backend_args))
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator

# optimizer
lr = 0.006
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
    clip_grad=dict(max_norm=0.1, norm_type=2),
    # the transformer decoder layers, their positional embeddings and the
    # query/key projections are trained with a 10x smaller learning rate
    # than the rest of the model
    paramwise_cfg=dict(
        custom_keys={
            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
            'bbox_head.decoder_self_posembeds': dict(
                lr_mult=0.1, decay_mult=1.0),
            'bbox_head.decoder_cross_posembeds': dict(
                lr_mult=0.1, decay_mult=1.0),
            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
        }))

# learning rate: decay by 10x at epochs 56 and 68
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=80,
        by_epoch=True,
        milestones=[56, 68],
        gamma=0.1)
]

# training schedule for 80 epochs
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

default_hooks = dict(
    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
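# A minimal usage sketch, assuming the standard MMDetection3D repository
# layout with its tools/train.py and tools/test.py entry points (the config
# path and checkpoint name below are hypothetical):
#
#   # single-GPU training
#   python tools/train.py configs/groupfree3d/groupfree3d_scannet.py
#
#   # evaluation with a trained checkpoint
#   python tools/test.py configs/groupfree3d/groupfree3d_scannet.py \
#       work_dirs/groupfree3d_scannet/epoch_80.pth
#
# The merged config (this file plus its _base_ files) can also be inspected
# programmatically with mmengine:
#
#   from mmengine.config import Config
#   cfg = Config.fromfile('configs/groupfree3d/groupfree3d_scannet.py')
#   print(cfg.model.bbox_head.num_proposal)  # 512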