From b4fdead39fdbd6cd1c8352857fdea4af69116c2d Mon Sep 17 00:00:00 2001 From: zltjohn <1163842861@qq.com> Date: Thu, 22 Dec 2022 10:49:44 +0800 Subject: [PATCH 1/2] support detr3d in mmdet3d-dev-1.x (v1.1) --- mmdetection3d | 2 +- ...d_res101_gridmask_cbgs_dev-1.x_test_old.py | 285 +++++++++++ .../config/detr3d_res101_gridmask_dev-1.x.py | 241 +++++++++ ...detr3d_res101_gridmask_dev-1.x_test_old.py | 283 ++++++++++ ...et_final_trainval_cbgs_dev-1.x_test_old.py | 279 ++++++++++ .../configs/detr3d/detr3d_res101_gridmask.py | 225 -------- .../detr3d/detr3d_res101_gridmask_cbgs.py | 228 --------- ...vovnet_gridmask_det_final_trainval_cbgs.py | 227 -------- projects/configs/obj_dgcnn/pillar.py | 295 ----------- projects/configs/obj_dgcnn/voxel.py | 293 ----------- projects/mmdet3d_plugin/__init__.py | 23 +- .../core/bbox/assigners/__init__.py | 3 - .../core/bbox/coders/__init__.py | 3 - .../core/bbox/match_costs/__init__.py | 4 - .../core/bbox/match_costs/match_cost.py | 27 - projects/mmdet3d_plugin/core/bbox/util.py | 53 -- projects/mmdet3d_plugin/datasets/__init__.py | 6 +- .../mmdet3d_plugin/datasets/custom_nus.py | 19 + .../datasets/nuscenes_dataset.py | 76 --- .../datasets/pipelines/__init__.py | 11 - .../datasets/pipelines/transform_3d.py | 306 ----------- .../datasets/transforms/__init__.py | 1 + .../datasets/transforms/transform_3d.py | 218 ++++++++ .../models/backbones/__init__.py | 2 - .../mmdet3d_plugin/models/backbones/vovnet.py | 306 ++++++----- .../models/dense_heads/__init__.py | 3 - .../models/dense_heads/detr3d_head.py | 438 ++++++++-------- .../models/dense_heads/dgcnn3d_head.py | 484 ------------------ .../models/detectors/__init__.py | 4 +- .../mmdet3d_plugin/models/detectors/detr3d.py | 362 +++++++------ .../models/detectors/detr3d_old.py | 40 ++ .../models/detectors/obj_dgcnn.py | 187 ------- .../models/task_modules/__init__.py | 3 + .../task_modules}/hungarian_assigner_3d.py | 98 ++-- .../models/task_modules/match_cost.py | 32 ++ .../task_modules}/nms_free_coder.py | 51 +- .../models/task_modules/util.py | 76 +++ .../mmdet3d_plugin/models/utils/__init__.py | 8 +- projects/mmdet3d_plugin/models/utils/detr.py | 106 ---- .../models/utils/detr3d_transformer.py | 287 ++++++----- .../mmdet3d_plugin/models/utils/dgcnn_attn.py | 96 ---- .../mmdet3d_plugin/models/utils/grid_mask.py | 100 ++-- 42 files changed, 2390 insertions(+), 3401 deletions(-) create mode 100644 projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py create mode 100644 projects/config/detr3d_res101_gridmask_dev-1.x.py create mode 100644 projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py create mode 100644 projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py delete mode 100644 projects/configs/detr3d/detr3d_res101_gridmask.py delete mode 100755 projects/configs/detr3d/detr3d_res101_gridmask_cbgs.py delete mode 100755 projects/configs/detr3d/detr3d_vovnet_gridmask_det_final_trainval_cbgs.py delete mode 100755 projects/configs/obj_dgcnn/pillar.py delete mode 100755 projects/configs/obj_dgcnn/voxel.py mode change 100755 => 100644 projects/mmdet3d_plugin/__init__.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/assigners/__init__.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/coders/__init__.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/util.py create mode 100644 
projects/mmdet3d_plugin/datasets/custom_nus.py delete mode 100644 projects/mmdet3d_plugin/datasets/nuscenes_dataset.py delete mode 100755 projects/mmdet3d_plugin/datasets/pipelines/__init__.py delete mode 100755 projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py create mode 100644 projects/mmdet3d_plugin/datasets/transforms/__init__.py create mode 100644 projects/mmdet3d_plugin/datasets/transforms/transform_3d.py mode change 100755 => 100644 projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py delete mode 100755 projects/mmdet3d_plugin/models/dense_heads/dgcnn3d_head.py mode change 100755 => 100644 projects/mmdet3d_plugin/models/detectors/detr3d.py create mode 100644 projects/mmdet3d_plugin/models/detectors/detr3d_old.py delete mode 100755 projects/mmdet3d_plugin/models/detectors/obj_dgcnn.py create mode 100644 projects/mmdet3d_plugin/models/task_modules/__init__.py rename projects/mmdet3d_plugin/{core/bbox/assigners => models/task_modules}/hungarian_assigner_3d.py (63%) create mode 100755 projects/mmdet3d_plugin/models/task_modules/match_cost.py rename projects/mmdet3d_plugin/{core/bbox/coders => models/task_modules}/nms_free_coder.py (73%) create mode 100755 projects/mmdet3d_plugin/models/task_modules/util.py delete mode 100755 projects/mmdet3d_plugin/models/utils/detr.py delete mode 100755 projects/mmdet3d_plugin/models/utils/dgcnn_attn.py diff --git a/mmdetection3d b/mmdetection3d index 60ce864..9a425a1 160000 --- a/mmdetection3d +++ b/mmdetection3d @@ -1 +1 @@ -Subproject commit 60ce864ff76af4316fb9ae56a2a5b7741bfdd9ab +Subproject commit 9a425a1109d29e808ab8e2d6ddaff2ae004cfa56 diff --git a/projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py b/projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py new file mode 100644 index 0000000..fc7d88f --- /dev/null +++ b/projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py @@ -0,0 +1,285 @@ +_base_ = [ + # '.../mmdetection3d/configs/_base_/datasets/nus-3d.py', + '../../mmdetection3d/configs/_base_/default_runtime.py' +] + +custom_imports = dict(imports=['projects.mmdet3d_plugin']) +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict(use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +# this means type='Detr3D' will be processed as 'mmdet3d.Detr3D' +default_scope = 'mmdet3d' +model = dict( + type='Detr3D_old', + use_grid_mask=True, + data_preprocessor=dict(type='Det3DDataPreprocessor', + **img_norm_cfg, + pad_size_divisor=32), + img_backbone=dict(type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', + deform_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + img_neck=dict(type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='Detr3DHead', + num_query=900, + num_classes=10, + 
in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='Detr3DTransformer', + decoder=dict( + type='Detr3DTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', # mmcv. + embed_dims=256, + num_heads=8, + dropout=0.1), + dict(type='Detr3DCrossAtten', + pc_range=point_cloud_range, + num_points=1, + embed_dims=256) + ], + feedforward_channels=512, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict(type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # ↓ Fake cost. This is just to make it compatible with DETR head. + iou_cost=dict(type='mmdet.IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nus_v2/' + +test_transforms = [ + dict(type='RandomResize3D', + scale=(1600, 900), + ratio_range=(1., 1.), + keep_ratio=True) +] +train_transforms = test_transforms + +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='MultiViewWrapper', transforms=train_transforms), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='MultiViewWrapper', transforms=test_transforms), + dict(type='Pack3DDetInputs', keys=['img']) +] + +metainfo = dict(classes=class_names) +data_prefix = dict(pts='', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT') + +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + load_type='frame_based', + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
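+            # DETR3D is camera-only, but its predictions are still decoded
+            # into the LiDAR frame, which is the convention NuScenesMetric
+            # evaluates against.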
+ box_type_3d='LiDAR'))) + +val_dataloader = dict(batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + load_type='frame_based', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=True, + data_prefix=data_prefix, + box_type_3d='LiDAR')) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox') +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), + paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), + clip_grad=dict(max_norm=35, norm_type=2), +) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict(type='CosineAnnealingLR', + by_epoch=True, + begin=0, + end=24, + T_max=24, + eta_min_ratio=1e-3) +] + +total_epochs = 24 + +train_cfg = dict(type='EpochBasedTrainLoop', + max_epochs=total_epochs, + val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# checkpoint_config = dict(interval=1, max_keep_ckpts=1) +default_hooks = dict(checkpoint=dict( + type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) +load_from = 'ckpts/fcos3d_yue.pth' + +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict(type='Det3DLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') + +# before fixing h,w bug in feature-sampling +# mAP: 0.3405 +# mATE: 0.7516 +# mASE: 0.2688 +# mAOE: 0.3750 +# mAVE: 0.8621 +# mAAE: 0.2080 +# NDS: 0.4237 +# Eval time: 124.2s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.513 0.590 0.153 0.066 0.932 0.197 +# truck 0.280 0.771 0.206 0.092 1.001 0.237 +# bus 0.356 0.860 0.193 0.131 1.826 0.370 +# trailer 0.179 1.110 0.234 0.564 0.950 0.159 +# construction_vehicle 0.080 0.988 0.444 0.947 0.120 0.330 +# pedestrian 0.397 0.683 0.305 0.532 0.469 0.200 +# motorcycle 0.338 0.712 0.255 0.359 1.194 0.158 +# bicycle 0.256 0.657 0.284 0.583 0.405 0.014 +# traffic_cone 0.508 0.542 0.323 nan nan nan +# barrier 0.501 0.602 0.292 0.100 nan nan + +# after fixing h,w bug in feature-sampling +# mAP: 0.3493 +# mATE: 0.7162 +# mASE: 0.2682 +# mAOE: 0.3795 +# mAVE: 0.8417 +# mAAE: 0.1996 +# NDS: 0.4341 +# Eval time: 128.7s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.542 0.533 0.151 0.064 0.954 0.193 +# truck 0.285 0.774 0.208 0.093 1.016 0.239 +# bus 0.363 0.796 0.192 0.137 1.842 0.379 +# trailer 0.167 1.075 0.236 0.610 0.718 0.094 +# construction_vehicle 0.081 0.970 0.438 0.914 0.114 0.337 +# pedestrian 0.400 0.684 0.306 0.531 0.474 0.201 +# motorcycle 0.337 0.684 0.257 0.383 1.203 0.143 +# bicycle 0.261 0.631 0.280 0.582 0.411 0.012 +# traffic_cone 0.531 0.478 0.324 nan nan nan +# barrier 0.525 0.536 0.291 0.102 nan nan diff --git a/projects/config/detr3d_res101_gridmask_dev-1.x.py b/projects/config/detr3d_res101_gridmask_dev-1.x.py new file mode 100644 index 0000000..2c14d6f --- /dev/null +++ b/projects/config/detr3d_res101_gridmask_dev-1.x.py @@ -0,0 +1,241 @@ +_base_ = [ + # '.../mmdetection3d/configs/_base_/datasets/nus-3d.py', + '../../mmdetection3d/configs/_base_/default_runtime.py' +] + +custom_imports = dict(imports=['projects.mmdet3d_plugin']) +# If point cloud range is 
changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict(use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +# this means type='Detr3D' will be processed as 'mmdet3d.Detr3D' +default_scope = 'mmdet3d' +model = dict( + type='Detr3D', + use_grid_mask=True, + data_preprocessor=dict(type='Det3DDataPreprocessor', + **img_norm_cfg, + pad_size_divisor=32), + img_backbone=dict(type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', + deform_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + img_neck=dict(type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='Detr3DHead', + num_query=900, + num_classes=10, + in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='Detr3DTransformer', + decoder=dict( + type='Detr3DTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', # mmcv. + embed_dims=256, + num_heads=8, + dropout=0.1), + dict(type='Detr3DCrossAtten', + pc_range=point_cloud_range, + num_points=1, + embed_dims=256) + ], + feedforward_channels=512, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict(type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # ↓ Fake cost. This is just to make it compatible with DETR head. 
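+            # With weight=0.0 the IoU term contributes nothing to the
+            # matching cost; it is kept only so the assigner config mirrors
+            # the 2D DETR head's.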
+ iou_cost=dict(type='mmdet.IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nus_v2/' + +test_transforms = [ + dict(type='RandomResize3D', + scale=(1600, 900), + ratio_range=(1., 1.), + keep_ratio=True) +] +train_transforms = test_transforms + +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='MultiViewWrapper', transforms=train_transforms), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='MultiViewWrapper', transforms=test_transforms), + dict(type='Pack3DDetInputs', keys=['img']) +] + +metainfo = dict(classes=class_names) +data_prefix = dict(pts='', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT') + +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + load_type='frame_based', + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR')) + +val_dataloader = dict(batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + load_type='frame_based', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=True, + data_prefix=data_prefix, + box_type_3d='LiDAR')) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox') +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), + paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), + clip_grad=dict(max_norm=35, norm_type=2), +) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict(type='CosineAnnealingLR', + by_epoch=True, + begin=0, + end=24, + T_max=24, + eta_min_ratio=1e-3) +] + +total_epochs = 24 + +train_cfg = dict(type='EpochBasedTrainLoop', + max_epochs=total_epochs, + val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# checkpoint_config = dict(interval=1, max_keep_ckpts=1) +default_hooks = dict(checkpoint=dict( + type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) +load_from = 'ckpts/fcos3d_yue.pth' + +# ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
+# This behaviour is the source of the following dependency conflicts. +# jupyter-packaging 0.12.3 requires setuptools>=60.2.0, but you have setuptools 58.0.4 which is incompatible. +# setuptools 65 downgrades to 58. On mmlab-node we use setuptools 61 and no errors occur. +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict(type='Det3DLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') diff --git a/projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py b/projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py new file mode 100644 index 0000000..d48b6d0 --- /dev/null +++ b/projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py @@ -0,0 +1,283 @@ +_base_ = [ + # '.../mmdetection3d/configs/_base_/datasets/nus-3d.py', + '../../mmdetection3d/configs/_base_/default_runtime.py' +] + +custom_imports = dict(imports=['projects.mmdet3d_plugin']) +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict(use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +# this means type='Detr3D' will be processed as 'mmdet3d.Detr3D' +default_scope = 'mmdet3d' +model = dict( + type='Detr3D_old', + use_grid_mask=True, + data_preprocessor=dict(type='Det3DDataPreprocessor', + **img_norm_cfg, + pad_size_divisor=32), + img_backbone=dict(type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', + deform_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + img_neck=dict(type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='Detr3DHead', + num_query=900, + num_classes=10, + in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='Detr3DTransformer', + decoder=dict( + type='Detr3DTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', # mmcv.
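+                            # unscoped type: MultiheadAttention is registered
+                            # by mmcv, so no 'mmdet.' prefix is needed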
+ embed_dims=256, + num_heads=8, + dropout=0.1), + dict(type='Detr3DCrossAtten', + pc_range=point_cloud_range, + num_points=1, + embed_dims=256) + ], + feedforward_channels=512, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict(type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # ↓ Fake cost. This is just to make it compatible with DETR head. + iou_cost=dict(type='mmdet.IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nus_v2/' + +test_transforms = [ + dict(type='RandomResize3D', + scale=(1600, 900), + ratio_range=(1., 1.), + keep_ratio=True) +] +train_transforms = test_transforms + +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='MultiViewWrapper', transforms=train_transforms), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='MultiViewWrapper', transforms=test_transforms), + dict(type='Pack3DDetInputs', keys=['img']) +] + +metainfo = dict(classes=class_names) +data_prefix = dict(pts='', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT') + +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + load_type='frame_based', + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
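+            # unlike the *_cbgs variant, this config trains on the raw,
+            # class-imbalanced nuScenes distribution (no CBGS resampling)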
+ box_type_3d='LiDAR')) + +val_dataloader = dict(batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + load_type='frame_based', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=True, + data_prefix=data_prefix, + box_type_3d='LiDAR')) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox') +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), + paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), + clip_grad=dict(max_norm=35, norm_type=2), +) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict(type='CosineAnnealingLR', + by_epoch=True, + begin=0, + end=24, + T_max=24, + eta_min_ratio=1e-3) +] + +total_epochs = 24 + +train_cfg = dict(type='EpochBasedTrainLoop', + max_epochs=total_epochs, + val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# checkpoint_config = dict(interval=1, max_keep_ckpts=1) +default_hooks = dict(checkpoint=dict( + type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) +load_from = 'ckpts/fcos3d_yue.pth' + +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict(type='Det3DLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') + +# before fixing h,w bug in feature-sampling +# mAP: 0.3450 +# mATE: 0.7740 +# mASE: 0.2675 +# mAOE: 0.3960 +# mAVE: 0.8737 +# mAAE: 0.2156 +# NDS: 0.4198 +# Eval time: 161.5s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.534 0.565 0.152 0.071 0.907 0.214 +# truck 0.285 0.839 0.213 0.114 0.984 0.229 +# bus 0.346 0.924 0.199 0.117 2.060 0.379 +# trailer 0.166 1.108 0.230 0.551 0.734 0.126 +# construction_vehicle 0.082 1.057 0.446 1.013 0.125 0.387 +# pedestrian 0.426 0.688 0.294 0.508 0.459 0.195 +# motorcycle 0.343 0.696 0.260 0.475 1.268 0.180 +# bicycle 0.275 0.691 0.275 0.578 0.452 0.015 +# traffic_cone 0.521 0.555 0.314 nan nan nan +# barrier 0.473 0.619 0.293 0.138 nan nan + +# after fixing h,w bug in feature-sampling +# mAP: 0.3469 +# mATE: 0.7651 +# mASE: 0.2678 +# mAOE: 0.3916 +# mAVE: 0.8758 +# mAAE: 0.2110 +# NDS: 0.4223 +# Eval time: 117.2s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.546 0.544 0.152 0.070 0.911 0.208 +# truck 0.286 0.834 0.212 0.113 1.005 0.231 +# bus 0.346 0.870 0.196 0.116 2.063 0.383 +# trailer 0.167 1.106 0.233 0.549 0.687 0.093 +# construction_vehicle 0.082 1.060 0.449 0.960 0.120 0.384 +# pedestrian 0.424 0.700 0.295 0.512 0.462 0.194 +# motorcycle 0.340 0.709 0.259 0.489 1.288 0.176 +# bicycle 0.278 0.698 0.275 0.586 0.473 0.018 +# traffic_cone 0.529 0.526 0.313 nan nan nan +# barrier 0.471 0.603 0.292 0.131 nan nan diff --git a/projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py b/projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py new file mode 100644 index 0000000..942e719 --- /dev/null +++ b/projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py @@ -0,0 +1,279 @@ +_base_ = [ + # '.../mmdetection3d/configs/_base_/datasets/nus-3d.py', + '../../mmdetection3d/configs/_base_/default_runtime.py' 
+] + +custom_imports = dict(imports=['projects.mmdet3d_plugin']) +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict(use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +# this means type='Detr3D' will be processed as 'mmdet3d.Detr3D' +default_scope = 'mmdet3d' +model = dict( + type='Detr3D_old', + use_grid_mask=True, + data_preprocessor=dict(type='Det3DDataPreprocessor', + **img_norm_cfg, + pad_size_divisor=32), + img_backbone=dict(type='VoVNet', + spec_name='V-99-eSE', + norm_eval=True, + frozen_stages=1, + input_ch=3, + out_features=['stage2', 'stage3', 'stage4', 'stage5']), + img_neck=dict(type='mmdet.FPN', + in_channels=[256, 512, 768, 1024], + out_channels=256, + start_level=0, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='Detr3DHead', + num_query=900, + num_classes=10, + in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + transformer=dict( + type='Detr3DTransformer', + decoder=dict( + type='Detr3DTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', # mmcv. + embed_dims=256, + num_heads=8, + dropout=0.1), + dict(type='Detr3DCrossAtten', + pc_range=point_cloud_range, + num_points=1, + embed_dims=256) + ], + feedforward_channels=512, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict(type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # ↓ Fake cost. This is just to make it compatible with DETR head. 
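+            # BBox3DL1Cost is this project's L1 cost on the encoded 3D box
+            # parameters (models/task_modules/match_cost.py in this patch)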
+ iou_cost=dict(type='mmdet.IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nus_v2/' + +test_transforms = [ + dict(type='RandomResize3D', + scale=(1600, 900), + ratio_range=(1., 1.), + keep_ratio=True) +] +train_transforms = test_transforms + +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='MultiViewWrapper', transforms=train_transforms), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='MultiViewWrapper', transforms=test_transforms), + dict(type='Pack3DDetInputs', keys=['img']) +] + +metainfo = dict(classes=class_names) +data_prefix = dict(pts='', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT') + +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_trainval.pkl', + pipeline=train_pipeline, + load_type='frame_based', + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
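+                # this run trains on train+val ('trainval'), so the val-set
+                # metrics reported below are optimistic by construction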
+ box_type_3d='LiDAR'))) + +val_dataloader = dict(batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + load_type='frame_based', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=True, + data_prefix=data_prefix, + box_type_3d='LiDAR')) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox') +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), + paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), + clip_grad=dict(max_norm=35, norm_type=2), +) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict(type='CosineAnnealingLR', + by_epoch=True, + begin=0, + end=24, + T_max=24, + eta_min_ratio=1e-3) +] + +total_epochs = 24 + +train_cfg = dict(type='EpochBasedTrainLoop', + max_epochs=total_epochs, + val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# checkpoint_config = dict(interval=1, max_keep_ckpts=1) +default_hooks = dict(checkpoint=dict( + type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) +load_from = 'ckpts/fcos3d_yue.pth' + +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict(type='Det3DLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') + +# before fixing h,w bug in feature-sampling +# mAP: 0.7103 +# mATE: 0.5395 +# mASE: 0.1455 +# mAOE: 0.0719 +# mAVE: 0.2233 +# mAAE: 0.1862 +# NDS: 0.7385 +# Eval time: 107.3s +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.706 0.569 0.116 0.033 0.261 0.202 +# truck 0.737 0.483 0.120 0.034 0.195 0.208 +# bus 0.760 0.463 0.108 0.028 0.296 0.240 +# trailer 0.739 0.453 0.124 0.042 0.138 0.147 +# construction_vehicle 0.710 0.513 0.178 0.085 0.139 0.329 +# pedestrian 0.715 0.510 0.205 0.203 0.248 0.138 +# motorcycle 0.692 0.560 0.149 0.089 0.357 0.218 +# bicycle 0.673 0.643 0.171 0.081 0.152 0.008 +# traffic_cone 0.691 0.569 0.172 nan nan nan +# barrier 0.681 0.633 0.112 0.052 nan nan + +# after fixing h,w bug in feature-sampling +# mAP: 0.8348 +# mATE: 0.3225 +# mASE: 0.1417 +# mAOE: 0.0676 +# mAVE: 0.2204 +# mAAE: 0.1820 +# NDS: 0.8240 +# Eval time: 97.4s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.873 0.256 0.114 0.033 0.260 0.195 +# truck 0.833 0.327 0.115 0.033 0.191 0.216 +# bus 0.842 0.323 0.104 0.027 0.293 0.244 +# trailer 0.779 0.394 0.116 0.041 0.136 0.123 +# construction_vehicle 0.784 0.406 0.174 0.079 0.137 0.320 +# pedestrian 0.806 0.380 0.203 0.181 0.244 0.135 +# motorcycle 0.822 0.337 0.150 0.085 0.347 0.213 +# bicycle 0.871 0.271 0.169 0.079 0.154 0.009 +# traffic_cone 0.877 0.241 0.162 nan nan nan +# barrier 0.861 0.289 0.110 0.050 nan nan diff --git a/projects/configs/detr3d/detr3d_res101_gridmask.py b/projects/configs/detr3d/detr3d_res101_gridmask.py deleted file mode 100644 index a4ff267..0000000 --- a/projects/configs/detr3d/detr3d_res101_gridmask.py +++ /dev/null @@ -1,225 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is 
changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=False) - -model = dict( - type='Detr3D', - use_grid_mask=True, - img_backbone=dict( - type='ResNet', - depth=101, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN2d', requires_grad=False), - norm_eval=True, - style='caffe', - dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), - stage_with_dcn=(False, False, True, True)), - img_neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - start_level=1, - add_extra_convs='on_output', - num_outs=4, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='Detr3DHead', - num_query=900, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='Detr3DTransformer', - decoder=dict( - type='Detr3DTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - dropout=0.1), - dict( - type='Detr3DCrossAtten', - pc_range=point_cloud_range, - num_points=1, - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
- pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['img']) - ]) -] - - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -optimizer = dict( - type='AdamW', - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=2, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) -load_from='ckpts/fcos3d.pth' diff --git a/projects/configs/detr3d/detr3d_res101_gridmask_cbgs.py b/projects/configs/detr3d/detr3d_res101_gridmask_cbgs.py deleted file mode 100755 index b23d7b3..0000000 --- a/projects/configs/detr3d/detr3d_res101_gridmask_cbgs.py +++ /dev/null @@ -1,228 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=False) - -model = dict( - type='Detr3D', - use_grid_mask=True, - img_backbone=dict( - type='ResNet', - depth=101, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN2d', requires_grad=False), - norm_eval=True, - style='caffe', - dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), - stage_with_dcn=(False, False, True, True)), - img_neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - start_level=1, - add_extra_convs='on_output', - num_outs=4, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='Detr3DHead', - num_query=900, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='Detr3DTransformer', - decoder=dict( - type='Detr3DTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - dropout=0.1), - dict( - type='Detr3DCrossAtten', - pc_range=point_cloud_range, - num_points=1, - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', 
loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. - pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['img']) - ]) -] - - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - ), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -optimizer = dict( - type='AdamW', - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=2, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) -load_from='pretrained/fcos3d.pth' diff --git a/projects/configs/detr3d/detr3d_vovnet_gridmask_det_final_trainval_cbgs.py b/projects/configs/detr3d/detr3d_vovnet_gridmask_det_final_trainval_cbgs.py deleted file mode 100755 index 4366532..0000000 --- a/projects/configs/detr3d/detr3d_vovnet_gridmask_det_final_trainval_cbgs.py +++ /dev/null @@ -1,227 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=True) - -model = dict( - type='Detr3D', - use_grid_mask=True, - img_backbone=dict( - type='VoVNet', - spec_name='V-99-eSE', - norm_eval=True, - frozen_stages=1, - input_ch=3, - out_features=['stage2', 'stage3', 'stage4', 'stage5']), - img_neck=dict( - type='FPN', - in_channels=[256, 512, 768, 1024], - out_channels=256, - start_level=0, - add_extra_convs='on_output', - num_outs=4, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='Detr3DHead', - num_query=900, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], - transformer=dict( - type='Detr3DTransformer', - decoder=dict( - type='Detr3DTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - dropout=0.1), - dict( - type='Detr3DCrossAtten', - pc_range=point_cloud_range, - num_points=1, - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', 
loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. - pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['img']) - ]) -] - - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_trainval.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - ), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -optimizer = dict( - type='AdamW', - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=2, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) -load_from='ckpts/dd3d_det_final.pth' - -find_unused_parameters=True diff --git a/projects/configs/obj_dgcnn/pillar.py b/projects/configs/obj_dgcnn/pillar.py deleted file mode 100755 index 39485de..0000000 --- a/projects/configs/obj_dgcnn/pillar.py +++ /dev/null @@ -1,295 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/schedules/cyclic_20e.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=True, - use_camera=False, - use_radar=False, - use_map=False, - use_external=False) - -model = dict( - type='ObjDGCNN', - pts_voxel_layer=dict( - max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000), - point_cloud_range=point_cloud_range), - pts_voxel_encoder=dict( - type='PillarFeatureNet', - in_channels=5, - feat_channels=[64], - with_distance=False, - voxel_size=(0.2, 0.2, 8), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - point_cloud_range=point_cloud_range, - legacy=False), - pts_middle_encoder=dict( - type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), - pts_backbone=dict( - type='SECOND', - in_channels=64, - out_channels=[64, 128, 256], - layer_nums=[3, 5, 5], - layer_strides=[2, 2, 2], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False)), - pts_neck=dict( - type='FPN', - norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01), - act_cfg=dict(type='ReLU'), - in_channels=[64, 128, 256], - out_channels=256, - start_level=0, - num_outs=4), - pts_bbox_head=dict( - type='DGCNN3DHead', - num_query=300, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='DeformableDetrTransformer', - encoder=dict( - type='DetrTransformerEncoder', - num_layers=2, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', embed_dims=256), - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'ffn', 'norm'))), - decoder=dict( - type='Deformable3DDetrTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='DGCNNAttn', - embed_dims=256, - num_heads=8, - K=16, - dropout=0.1), - dict( - 
type='MultiScaleDeformableAttention', - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), # For DETR compatibility. - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. - pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), - dict( - type='GlobalRotScaleTrans', - rot_range=[-0.3925, 0.3925], - scale_ratio_range=[0.95, 1.05], - translation_std=[0, 0, 0]), - dict( - type='RandomFlip3D', - sync_2d=False, - flip_ratio_bev_horizontal=0.5, - flip_ratio_bev_vertical=0.5), - dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='PointShuffle'), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['points']) - ]) -] - -# 
construct a pipeline for data and gt loading in show function -# please keep its loading function consistent with test_pipeline (e.g. client) -eval_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['points']) -] - -data = dict( - samples_per_gpu=3, - workers_per_gpu=4, - train=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. - box_type_3d='LiDAR'), - ), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -evaluation = dict(interval=20, pipeline=eval_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=20) - -optimizer = dict( - type='AdamW', - lr=1e-4, - paramwise_cfg=dict( - custom_keys={ - 'pts_voxel_encoder': dict(lr_mult=0.1), - 'SECOND': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) - -load_from='ckpts/pillar.pth' -find_unused_parameters = True \ No newline at end of file diff --git a/projects/configs/obj_dgcnn/voxel.py b/projects/configs/obj_dgcnn/voxel.py deleted file mode 100755 index a7a76be..0000000 --- a/projects/configs/obj_dgcnn/voxel.py +++ /dev/null @@ -1,293 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/schedules/cyclic_20e.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.1, 0.1, 0.2] - -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=True, - use_camera=False, - use_radar=False, - use_map=False, - use_external=False) - -model = dict( - type='ObjDGCNN', - pts_voxel_layer=dict( - max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000), - point_cloud_range=point_cloud_range), - pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), - pts_middle_encoder=dict( - type='SparseEncoder', - in_channels=5, - sparse_shape=[41, 1024, 1024], - output_channels=128, - order=('conv', 'norm', 'act'), - encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, - 128)), - encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), - block_type='basicblock'), - pts_backbone=dict( - type='SECOND', - in_channels=256, - out_channels=[128, 256], - layer_nums=[5, 5], - layer_strides=[1, 2], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False)), - pts_neck=dict( - type='FPN', - norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01), - 
act_cfg=dict(type='ReLU'), - in_channels=[128, 256], - out_channels=256, - start_level=0, - num_outs=4), - pts_bbox_head=dict( - type='DGCNN3DHead', - num_query=300, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='DeformableDetrTransformer', - encoder=dict( - type='DetrTransformerEncoder', - num_layers=2, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', embed_dims=256), - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'ffn', 'norm'))), - decoder=dict( - type='Deformable3DDetrTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='DGCNNAttn', - embed_dims=256, - num_heads=8, - K=16, - dropout=0.1), - dict( - type='MultiScaleDeformableAttention', - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), # For DETR compatibility. - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
- pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), - dict( - type='GlobalRotScaleTrans', - rot_range=[-0.3925, 0.3925], - scale_ratio_range=[0.95, 1.05], - translation_std=[0, 0, 0]), - dict( - type='RandomFlip3D', - sync_2d=False, - flip_ratio_bev_horizontal=0.5, - flip_ratio_bev_vertical=0.5), - dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='PointShuffle'), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['points']) - ]) -] - -# construct a pipeline for data and gt loading in show function -# please keep its loading function consistent with test_pipeline (e.g. client) -eval_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['points']) -] - -data = dict( - samples_per_gpu=4, - workers_per_gpu=6, - train=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - ), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -evaluation = dict(interval=20, pipeline=eval_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=20) -optimizer = dict( - type='AdamW', - lr=1e-4, - paramwise_cfg=dict( - custom_keys={ - 'pts_voxel_encoder': dict(lr_mult=0.1), - 'SECOND': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -load_from='pretrained/voxel.pth' -find_unused_parameters = True \ No newline at end of file diff --git a/projects/mmdet3d_plugin/__init__.py b/projects/mmdet3d_plugin/__init__.py old mode 100755 new mode 100644 index b0e5d56..8c4ea6d --- a/projects/mmdet3d_plugin/__init__.py +++ b/projects/mmdet3d_plugin/__init__.py @@ -1,16 +1,11 @@ -from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D -from .core.bbox.coders.nms_free_coder import NMSFreeCoder -from .core.bbox.match_costs import BBox3DL1Cost -from .datasets import CustomNuScenesDataset -from .datasets.pipelines import ( - PhotoMetricDistortionMultiViewImage, PadMultiViewImage, - NormalizeMultiviewImage, CropMultiViewImage, RandomScaleImageMultiViewImage, - HorizontalRandomFlipMultiViewImage) +from .datasets.transforms import PhotoMetricDistortionMultiViewImage +# from .datasets import custom_nuscenes from .models.backbones.vovnet import VoVNet -from .models.detectors.obj_dgcnn import ObjDGCNN -from .models.detectors.detr3d import Detr3D -from .models.dense_heads.dgcnn3d_head import DGCNN3DHead from .models.dense_heads.detr3d_head import Detr3DHead -from .models.utils.detr import Deformable3DDetrTransformerDecoder -from .models.utils.dgcnn_attn import DGCNNAttn -from .models.utils.detr3d_transformer import Detr3DTransformer, Detr3DTransformerDecoder, Detr3DCrossAtten +from .models.detectors.detr3d import Detr3D +from .models.task_modules.hungarian_assigner_3d import HungarianAssigner3D +from .models.task_modules.match_cost import BBox3DL1Cost +from .models.task_modules.nms_free_coder import NMSFreeCoder +from .models.utils.detr3d_transformer import (Detr3DCrossAtten, + Detr3DTransformer, + Detr3DTransformerDecoder) diff --git a/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py b/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py deleted file mode 100755 index 23156d8..0000000 --- a/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .hungarian_assigner_3d import HungarianAssigner3D - -__all__ = ['HungarianAssigner3D'] diff --git a/projects/mmdet3d_plugin/core/bbox/coders/__init__.py b/projects/mmdet3d_plugin/core/bbox/coders/__init__.py deleted file mode 100755 index c670f5e..0000000 --- a/projects/mmdet3d_plugin/core/bbox/coders/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .nms_free_coder import NMSFreeCoder - -__all__ = ['NMSFreeCoder'] diff --git a/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py b/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py deleted file mode 100755 index aac1a82..0000000 --- a/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from mmdet.core.bbox.match_costs import build_match_cost -from .match_cost import BBox3DL1Cost - -__all__ = ['build_match_cost', 'BBox3DL1Cost'] \ No newline at end of file diff --git a/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py b/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py 
deleted file mode 100755 index d9678f3..0000000 --- a/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py +++ /dev/null @@ -1,27 +0,0 @@ -import torch -from mmdet.core.bbox.match_costs.builder import MATCH_COST - - -@MATCH_COST.register_module() -class BBox3DL1Cost(object): - """BBox3DL1Cost. - Args: - weight (int | float, optional): loss_weight - """ - - def __init__(self, weight=1.): - self.weight = weight - - def __call__(self, bbox_pred, gt_bboxes): - """ - Args: - bbox_pred (Tensor): Predicted boxes with normalized coordinates - (cx, cy, w, h), which are all in range [0, 1]. Shape - [num_query, 4]. - gt_bboxes (Tensor): Ground truth boxes with normalized - coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. - Returns: - torch.Tensor: bbox_cost value with weight - """ - bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) - return bbox_cost * self.weight \ No newline at end of file diff --git a/projects/mmdet3d_plugin/core/bbox/util.py b/projects/mmdet3d_plugin/core/bbox/util.py deleted file mode 100755 index 764df44..0000000 --- a/projects/mmdet3d_plugin/core/bbox/util.py +++ /dev/null @@ -1,53 +0,0 @@ -import torch - - -def normalize_bbox(bboxes, pc_range): - - cx = bboxes[..., 0:1] - cy = bboxes[..., 1:2] - cz = bboxes[..., 2:3] - w = bboxes[..., 3:4].log() - l = bboxes[..., 4:5].log() - h = bboxes[..., 5:6].log() - - rot = bboxes[..., 6:7] - if bboxes.size(-1) > 7: - vx = bboxes[..., 7:8] - vy = bboxes[..., 8:9] - normalized_bboxes = torch.cat( - (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 - ) - else: - normalized_bboxes = torch.cat( - (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 - ) - return normalized_bboxes - -def denormalize_bbox(normalized_bboxes, pc_range): - # rotation - rot_sine = normalized_bboxes[..., 6:7] - - rot_cosine = normalized_bboxes[..., 7:8] - rot = torch.atan2(rot_sine, rot_cosine) - - # center in the bev - cx = normalized_bboxes[..., 0:1] - cy = normalized_bboxes[..., 1:2] - cz = normalized_bboxes[..., 4:5] - - # size - w = normalized_bboxes[..., 2:3] - l = normalized_bboxes[..., 3:4] - h = normalized_bboxes[..., 5:6] - - w = w.exp() - l = l.exp() - h = h.exp() - if normalized_bboxes.size(-1) > 8: - # velocity - vx = normalized_bboxes[:, 8:9] - vy = normalized_bboxes[:, 9:10] - denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) - else: - denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) - return denormalized_bboxes \ No newline at end of file diff --git a/projects/mmdet3d_plugin/datasets/__init__.py b/projects/mmdet3d_plugin/datasets/__init__.py index 99b36cb..ad6c3f9 100644 --- a/projects/mmdet3d_plugin/datasets/__init__.py +++ b/projects/mmdet3d_plugin/datasets/__init__.py @@ -1,5 +1 @@ -from .nuscenes_dataset import CustomNuScenesDataset - -__all__ = [ - 'CustomNuScenesDataset' -] +# from .custom_nus import custom_nuscenes diff --git a/projects/mmdet3d_plugin/datasets/custom_nus.py b/projects/mmdet3d_plugin/datasets/custom_nus.py new file mode 100644 index 0000000..75d93dc --- /dev/null +++ b/projects/mmdet3d_plugin/datasets/custom_nus.py @@ -0,0 +1,19 @@ +# from mmdet3d.datasets.nuscenes_dataset import NuScenesDataset +# from typing import Callable, List, Union +# from mmdet3d.structures.bbox_3d.utils import get_lidar2img +# import torch + +# from mmdet3d.registry import DATASETS +# @DATASETS.register_module() +# class custom_nuscenes(NuScenesDataset): +# def __init__(self, **kwargs): +# super().__init__(**kwargs) + +# def parse_data_info(self, info: dict) -> Union[List[dict], dict]: +# 
data_info = super().parse_data_info(info) +# for cam in data_info['images']: +# lidar2cam = torch.Tensor(data_info['images'][cam]['lidar2cam']) +# cam2img = torch.Tensor(data_info['images'][cam]['cam2img']) +# lidar2img = get_lidar2img(cam2img, lidar2cam) +# data_info['images'][cam]['lidar2img'] = lidar2img.numpy() +# return data_info diff --git a/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py b/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py deleted file mode 100644 index 1ec4cf2..0000000 --- a/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py +++ /dev/null @@ -1,76 +0,0 @@ -import numpy as np -from mmdet.datasets import DATASETS -from mmdet3d.datasets import NuScenesDataset - - -@DATASETS.register_module() -class CustomNuScenesDataset(NuScenesDataset): - r"""NuScenes Dataset. - - This datset only add camera intrinsics and extrinsics to the results. - """ - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data \ - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): Sample index. - - pts_filename (str): Filename of point clouds. - - sweeps (list[dict]): Infos of sweeps. - - timestamp (float): Sample timestamp. - - img_filename (str, optional): Image filename. - - lidar2img (list[np.ndarray], optional): Transformations \ - from lidar to different cameras. - - ann_info (dict): Annotation info. - """ - info = self.data_infos[index] - # standard protocal modified from SECOND.Pytorch - input_dict = dict( - sample_idx=info['token'], - pts_filename=info['lidar_path'], - sweeps=info['sweeps'], - timestamp=info['timestamp'] / 1e6, - ) - - if self.modality['use_camera']: - image_paths = [] - lidar2img_rts = [] - lidar2cam_rts = [] - cam_intrinsics = [] - for cam_type, cam_info in info['cams'].items(): - image_paths.append(cam_info['data_path']) - # obtain lidar to image transformation matrix - lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) - lidar2cam_t = cam_info[ - 'sensor2lidar_translation'] @ lidar2cam_r.T - lidar2cam_rt = np.eye(4) - lidar2cam_rt[:3, :3] = lidar2cam_r.T - lidar2cam_rt[3, :3] = -lidar2cam_t - intrinsic = cam_info['cam_intrinsic'] - viewpad = np.eye(4) - viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic - lidar2img_rt = (viewpad @ lidar2cam_rt.T) - lidar2img_rts.append(lidar2img_rt) - - cam_intrinsics.append(viewpad) - lidar2cam_rts.append(lidar2cam_rt.T) - - input_dict.update( - dict( - img_filename=image_paths, - lidar2img=lidar2img_rts, - cam_intrinsic=cam_intrinsics, - lidar2cam=lidar2cam_rts, - )) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - - return input_dict diff --git a/projects/mmdet3d_plugin/datasets/pipelines/__init__.py b/projects/mmdet3d_plugin/datasets/pipelines/__init__.py deleted file mode 100755 index a246a61..0000000 --- a/projects/mmdet3d_plugin/datasets/pipelines/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .transform_3d import ( - PadMultiViewImage, NormalizeMultiviewImage, - PhotoMetricDistortionMultiViewImage, CropMultiViewImage, - RandomScaleImageMultiViewImage, - HorizontalRandomFlipMultiViewImage) - -__all__ = [ - 'PadMultiViewImage', 'NormalizeMultiviewImage', - 'PhotoMetricDistortionMultiViewImage', 'CropMultiViewImage', - 'RandomScaleImageMultiViewImage', 'HorizontalRandomFlipMultiViewImage' -] \ No newline at end of file diff --git 
a/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py b/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py deleted file mode 100755 index a0302f6..0000000 --- a/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py +++ /dev/null @@ -1,306 +0,0 @@ -import numpy as np -from numpy import random -import mmcv -from mmdet.datasets.builder import PIPELINES - - -@PIPELINES.register_module() -class PadMultiViewImage(object): - """Pad the multi-view image. - There are two padding modes: (1) pad to a fixed size and (2) pad to the - minimum size that is divisible by some number. - Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", - Args: - size (tuple, optional): Fixed padding size. - size_divisor (int, optional): The divisor of padded size. - pad_val (float, optional): Padding value, 0 by default. - """ - - def __init__(self, size=None, size_divisor=None, pad_val=0): - self.size = size - self.size_divisor = size_divisor - self.pad_val = pad_val - # only one of size and size_divisor should be valid - assert size is not None or size_divisor is not None - assert size is None or size_divisor is None - - def _pad_img(self, results): - """Pad images according to ``self.size``.""" - if self.size is not None: - padded_img = [mmcv.impad( - img, shape=self.size, pad_val=self.pad_val) for img in results['img']] - elif self.size_divisor is not None: - padded_img = [mmcv.impad_to_multiple( - img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] - results['img'] = padded_img - results['img_shape'] = [img.shape for img in padded_img] - results['pad_shape'] = [img.shape for img in padded_img] - results['pad_fixed_size'] = self.size - results['pad_size_divisor'] = self.size_divisor - - def __call__(self, results): - """Call function to pad images, masks, semantic segmentation maps. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Updated result dict. - """ - self._pad_img(results) - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(size={self.size}, ' - repr_str += f'size_divisor={self.size_divisor}, ' - repr_str += f'pad_val={self.pad_val})' - return repr_str - - -@PIPELINES.register_module() -class NormalizeMultiviewImage(object): - """Normalize the image. - Added key is "img_norm_cfg". - Args: - mean (sequence): Mean values of 3 channels. - std (sequence): Std values of 3 channels. - to_rgb (bool): Whether to convert the image from BGR to RGB, - default is true. - """ - - def __init__(self, mean, std, to_rgb=True): - self.mean = np.array(mean, dtype=np.float32) - self.std = np.array(std, dtype=np.float32) - self.to_rgb = to_rgb - - def __call__(self, results): - """Call function to normalize images. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Normalized results, 'img_norm_cfg' key is added into - result dict. - """ - results['img'] = [mmcv.imnormalize( - img, self.mean, self.std, self.to_rgb) for img in results['img']] - results['img_norm_cfg'] = dict( - mean=self.mean, std=self.std, to_rgb=self.to_rgb) - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' - return repr_str - - -@PIPELINES.register_module() -class PhotoMetricDistortionMultiViewImage: - """Apply photometric distortion to image sequentially, every transformation - is applied with a probability of 0.5. The position of random contrast is in - second or second to last. - 1. 
random brightness - 2. random contrast (mode 0) - 3. convert color from BGR to HSV - 4. random saturation - 5. random hue - 6. convert color from HSV to BGR - 7. random contrast (mode 1) - 8. randomly swap channels - Args: - brightness_delta (int): delta of brightness. - contrast_range (tuple): range of contrast. - saturation_range (tuple): range of saturation. - hue_delta (int): delta of hue. - """ - - def __init__(self, - brightness_delta=32, - contrast_range=(0.5, 1.5), - saturation_range=(0.5, 1.5), - hue_delta=18): - self.brightness_delta = brightness_delta - self.contrast_lower, self.contrast_upper = contrast_range - self.saturation_lower, self.saturation_upper = saturation_range - self.hue_delta = hue_delta - - def __call__(self, results): - """Call function to perform photometric distortion on images. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Result dict with images distorted. - """ - imgs = results['img'] - new_imgs = [] - for img in imgs: - assert img.dtype == np.float32, \ - 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ - ' please set "to_float32=True" in "LoadImageFromFile" pipeline' - # random brightness - if random.randint(2): - delta = random.uniform(-self.brightness_delta, - self.brightness_delta) - img += delta - - # mode == 0 --> do random contrast first - # mode == 1 --> do random contrast last - mode = random.randint(2) - if mode == 1: - if random.randint(2): - alpha = random.uniform(self.contrast_lower, - self.contrast_upper) - img *= alpha - - # convert color from BGR to HSV - img = mmcv.bgr2hsv(img) - - # random saturation - if random.randint(2): - img[..., 1] *= random.uniform(self.saturation_lower, - self.saturation_upper) - - # random hue - if random.randint(2): - img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) - img[..., 0][img[..., 0] > 360] -= 360 - img[..., 0][img[..., 0] < 0] += 360 - - # convert color from HSV to BGR - img = mmcv.hsv2bgr(img) - - # random contrast - if mode == 0: - if random.randint(2): - alpha = random.uniform(self.contrast_lower, - self.contrast_upper) - img *= alpha - - # randomly swap channels - if random.randint(2): - img = img[..., random.permutation(3)] - new_imgs.append(img) - results['img'] = new_imgs - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' - repr_str += 'contrast_range=' - repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' - repr_str += 'saturation_range=' - repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' - repr_str += f'hue_delta={self.hue_delta})' - return repr_str - - -@PIPELINES.register_module() -class CropMultiViewImage(object): - """Crop the image - Args: - size (tuple, optional): Fixed padding size. - """ - - def __init__(self, size=None): - self.size = size - - def __call__(self, results): - """Call function to pad images, masks, semantic segmentation maps. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Updated result dict. - """ - results['img'] = [img[:self.size[0], :self.size[1], ...] 
for img in results['img']] - results['img_shape'] = [img.shape for img in results['img']] - results['img_fixed_size'] = self.size - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(size={self.size}, ' - return repr_str - - -@PIPELINES.register_module() -class RandomScaleImageMultiViewImage(object): - """Random scale the image - Args: - scales - """ - - def __init__(self, scales=[0.5, 1.0, 1.5]): - self.scales = scales - - def __call__(self, results): - """Call function to pad images, masks, semantic segmentation maps. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Updated result dict. - """ - np.random.shuffle(self.scales) - rand_scale = self.scales[0] - img_shape = results['img_shape'][0] - y_size = int(img_shape[0] * rand_scale) - x_size = int(img_shape[1] * rand_scale) - scale_factor = np.eye(4) - scale_factor[0, 0] *= rand_scale - scale_factor[1, 1] *= rand_scale - results['img'] = [mmcv.imresize(img, (x_size, y_size), return_scale=False) for img in results['img']] - lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] - results['lidar2img'] = lidar2img - results['img_shape'] = [img.shape for img in results['img']] - results['gt_bboxes_3d'].tensor[:, :6] *= rand_scale - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(size={self.scales}, ' - return repr_str - - -@PIPELINES.register_module() -class HorizontalRandomFlipMultiViewImage(object): - - def __init__(self, flip_ratio=0.5): - self.flip_ratio = 0.5 - - def __call__(self, results): - if np.random.rand() >= self.flip_ratio: - return results - results = self.flip_bbox(results) - results = self.flip_cam_params(results) - results = self.flip_img(results) - return results - - def flip_img(self, results, direction='horizontal'): - results['img'] = [mmcv.imflip(img, direction) for img in results['img']] - return results - - def flip_cam_params(self, results): - flip_factor = np.eye(4) - flip_factor[1, 1] = -1 - lidar2cam = [l2c @ flip_factor for l2c in results['lidar2cam']] - w = results['img_shape'][0][1] - lidar2img = [] - for cam_intrinsic, l2c in zip(results['cam_intrinsic'], lidar2cam): - cam_intrinsic[0, 2] = w - cam_intrinsic[0, 2] - lidar2img.append(cam_intrinsic @ l2c) - results['lidar2cam'] = lidar2cam - results['lidar2img'] = lidar2img - return results - - def flip_bbox(self, input_dict, direction='horizontal'): - assert direction in ['horizontal', 'vertical'] - if len(input_dict['bbox3d_fields']) == 0: # test mode - input_dict['bbox3d_fields'].append('empty_box3d') - input_dict['empty_box3d'] = input_dict['box_type_3d']( - np.array([], dtype=np.float32)) - assert len(input_dict['bbox3d_fields']) == 1 - for key in input_dict['bbox3d_fields']: - if 'points' in input_dict: - input_dict['points'] = input_dict[key].flip( - direction, points=input_dict['points']) - else: - input_dict[key].flip(direction) - return input_dict - diff --git a/projects/mmdet3d_plugin/datasets/transforms/__init__.py b/projects/mmdet3d_plugin/datasets/transforms/__init__.py new file mode 100644 index 0000000..632b455 --- /dev/null +++ b/projects/mmdet3d_plugin/datasets/transforms/__init__.py @@ -0,0 +1 @@ +from .transform_3d import PhotoMetricDistortionMultiViewImage diff --git a/projects/mmdet3d_plugin/datasets/transforms/transform_3d.py b/projects/mmdet3d_plugin/datasets/transforms/transform_3d.py new file mode 100644 index 0000000..df46926 --- /dev/null +++ b/projects/mmdet3d_plugin/datasets/transforms/transform_3d.py @@ 
-0,0 +1,218 @@ +import mmcv +import numpy as np +from mmdet3d.registry import TRANSFORMS +from numpy import random + + +@TRANSFORMS.register_module() +class PhotoMetricDistortionMultiViewImage: + """Apply photometric distortion to images sequentially; every transformation + is applied with a probability of 0.5. + + Random contrast is applied either + second or second to last. + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def __call__(self, results): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Result dict with images distorted. + """ + imgs = results['img'] + new_imgs = [] + for img in imgs: + assert img.dtype == np.float32, \ + 'PhotoMetricDistortion needs input images of dtype np.float32,'\ + ' please set "to_float32=True" in "LoadImageFromFile" pipeline' + # random brightness + if random.randint(2): + delta = random.uniform(-self.brightness_delta, + self.brightness_delta) + img += delta + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # convert color from BGR to HSV + img = mmcv.bgr2hsv(img) + + # random saturation + if random.randint(2): + img[..., 1] *= random.uniform(self.saturation_lower, + self.saturation_upper) + + # random hue + if random.randint(2): + img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # randomly swap channels + if random.randint(2): + img = img[..., random.permutation(3)] + new_imgs.append(img) + results['img'] = new_imgs + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + +# @TRANSFORMS.register_module() +# class CropMultiViewImage(object): +# """Crop the image +# Args: +# size (tuple, optional): Fixed padding size. +# """ + +# def __init__(self, size=None): +# self.size = size + +# def __call__(self, results): +# """Call function to crop images. +# Args: +# results (dict): Result dict from loading pipeline. +# Returns: +# dict: Updated result dict.
+# """ +# results['img'] = [img[:self.size[0], :self.size[1], ...] for img in results['img']] +# results['img_shape'] = [img.shape for img in results['img']] +# results['img_fixed_size'] = self.size +# return results + +# def __repr__(self): +# repr_str = self.__class__.__name__ +# repr_str += f'(size={self.size}, ' +# return repr_str + +# @TRANSFORMS.register_module() +# class RandomScaleImageMultiViewImage(object): +# """Random scale the image +# Args: +# scales +# """ + +# def __init__(self, scales=[0.5, 1.0, 1.5]): +# self.scales = scales + +# def __call__(self, results): +# """Call function to pad images, masks, semantic segmentation maps. +# Args: +# results (dict): Result dict from loading pipeline. +# Returns: +# dict: Updated result dict. +# """ +# np.random.shuffle(self.scales) +# rand_scale = self.scales[0] +# img_shape = results['img_shape'][0] +# y_size = int(img_shape[0] * rand_scale) +# x_size = int(img_shape[1] * rand_scale) +# scale_factor = np.eye(4) +# scale_factor[0, 0] *= rand_scale +# scale_factor[1, 1] *= rand_scale +# results['img'] = [mmcv.imresize(img, (x_size, y_size), return_scale=False) for img in results['img']] +# lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] +# results['lidar2img'] = lidar2img +# results['img_shape'] = [img.shape for img in results['img']] +# results['gt_bboxes_3d'].tensor[:, :6] *= rand_scale +# return results + +# def __repr__(self): +# repr_str = self.__class__.__name__ +# repr_str += f'(size={self.scales}, ' +# return repr_str + +# @TRANSFORMS.register_module() +# class HorizontalRandomFlipMultiViewImage(object): + +# def __init__(self, flip_ratio=0.5): +# self.flip_ratio = 0.5 + +# def __call__(self, results): +# if np.random.rand() >= self.flip_ratio: +# return results +# results = self.flip_bbox(results) +# results = self.flip_cam_params(results) +# results = self.flip_img(results) +# return results + +# def flip_img(self, results, direction='horizontal'): +# results['img'] = [mmcv.imflip(img, direction) for img in results['img']] +# return results + +# def flip_cam_params(self, results): +# flip_factor = np.eye(4) +# flip_factor[1, 1] = -1 +# lidar2cam = [l2c @ flip_factor for l2c in results['lidar2cam']] +# w = results['img_shape'][0][1] +# lidar2img = [] +# for cam_intrinsic, l2c in zip(results['cam_intrinsic'], lidar2cam): +# cam_intrinsic[0, 2] = w - cam_intrinsic[0, 2] +# lidar2img.append(cam_intrinsic @ l2c) +# results['lidar2cam'] = lidar2cam +# results['lidar2img'] = lidar2img +# return results + +# def flip_bbox(self, input_dict, direction='horizontal'): +# assert direction in ['horizontal', 'vertical'] +# if len(input_dict['bbox3d_fields']) == 0: # test mode +# input_dict['bbox3d_fields'].append('empty_box3d') +# input_dict['empty_box3d'] = input_dict['box_type_3d']( +# np.array([], dtype=np.float32)) +# assert len(input_dict['bbox3d_fields']) == 1 +# for key in input_dict['bbox3d_fields']: +# if 'points' in input_dict: +# input_dict['points'] = input_dict[key].flip( +# direction, points=input_dict['points']) +# else: +# input_dict[key].flip(direction) +# return input_dict diff --git a/projects/mmdet3d_plugin/models/backbones/__init__.py b/projects/mmdet3d_plugin/models/backbones/__init__.py index cea72f5..418d62a 100755 --- a/projects/mmdet3d_plugin/models/backbones/__init__.py +++ b/projects/mmdet3d_plugin/models/backbones/__init__.py @@ -1,3 +1 @@ from .vovnet import VoVNet - -__all__ = ['VoVNet'] \ No newline at end of file diff --git a/projects/mmdet3d_plugin/models/backbones/vovnet.py 
b/projects/mmdet3d_plugin/models/backbones/vovnet.py index 879d186..3490597 100755 --- a/projects/mmdet3d_plugin/models/backbones/vovnet.py +++ b/projects/mmdet3d_plugin/models/backbones/vovnet.py @@ -1,31 +1,30 @@ - from collections import OrderedDict -from mmcv.runner import BaseModule -from mmdet.models.builder import BACKBONES + import torch import torch.nn as nn import torch.nn.functional as F +from mmdet3d.registry import MODELS +from mmengine.model import BaseModule from torch.nn.modules.batchnorm import _BatchNorm - VoVNet19_slim_dw_eSE = { 'stem': [64, 64, 64], 'stage_conv_ch': [64, 80, 96, 112], 'stage_out_ch': [112, 256, 384, 512], - "layer_per_block": 3, - "block_per_stage": [1, 1, 1, 1], - "eSE": True, - "dw": True + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + 'dw': True } VoVNet19_dw_eSE = { 'stem': [64, 64, 64], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 3, - "block_per_stage": [1, 1, 1, 1], - "eSE": True, - "dw": True + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + 'dw': True } VoVNet19_slim_eSE = { @@ -35,89 +34,103 @@ 'layer_per_block': 3, 'block_per_stage': [1, 1, 1, 1], 'eSE': True, - "dw": False + 'dw': False } VoVNet19_eSE = { 'stem': [64, 64, 128], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 3, - "block_per_stage": [1, 1, 1, 1], - "eSE": True, - "dw": False + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + 'dw': False } VoVNet39_eSE = { 'stem': [64, 64, 128], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 5, - "block_per_stage": [1, 1, 2, 2], - "eSE": True, - "dw": False + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 5, + 'block_per_stage': [1, 1, 2, 2], + 'eSE': True, + 'dw': False } VoVNet57_eSE = { 'stem': [64, 64, 128], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 5, - "block_per_stage": [1, 1, 4, 3], - "eSE": True, - "dw": False + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 5, + 'block_per_stage': [1, 1, 4, 3], + 'eSE': True, + 'dw': False } VoVNet99_eSE = { 'stem': [64, 64, 128], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 5, - "block_per_stage": [1, 3, 9, 3], - "eSE": True, - "dw": False + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 5, + 'block_per_stage': [1, 3, 9, 3], + 'eSE': True, + 'dw': False } _STAGE_SPECS = { - "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE, - "V-19-dw-eSE": VoVNet19_dw_eSE, - "V-19-slim-eSE": VoVNet19_slim_eSE, - "V-19-eSE": VoVNet19_eSE, - "V-39-eSE": VoVNet39_eSE, - "V-57-eSE": VoVNet57_eSE, - "V-99-eSE": VoVNet99_eSE, + 'V-19-slim-dw-eSE': VoVNet19_slim_dw_eSE, + 'V-19-dw-eSE': VoVNet19_dw_eSE, + 'V-19-slim-eSE': VoVNet19_slim_eSE, + 'V-19-eSE': VoVNet19_eSE, + 'V-39-eSE': VoVNet39_eSE, + 'V-57-eSE': VoVNet57_eSE, + 'V-99-eSE': VoVNet99_eSE, } -def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): - """3x3 convolution with padding""" +def dw_conv3x3(in_channels, + out_channels, + module_name, + 
postfix, + stride=1, + kernel_size=3, + padding=1): + """3x3 convolution with padding.""" return [ - ( - '{}_{}/dw_conv3x3'.format(module_name, postfix), - nn.Conv2d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=out_channels, - bias=False - ) - ), - ( - '{}_{}/pw_conv1x1'.format(module_name, postfix), - nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) - ), - ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), + ('{}_{}/dw_conv3x3'.format(module_name, postfix), + nn.Conv2d(in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=out_channels, + bias=False)), + ('{}_{}/pw_conv1x1'.format(module_name, postfix), + nn.Conv2d(in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False)), + ('{}_{}/pw_norm'.format(module_name, + postfix), nn.BatchNorm2d(out_channels)), ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), ] -def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): - """3x3 convolution with padding""" +def conv3x3(in_channels, + out_channels, + module_name, + postfix, + stride=1, + groups=1, + kernel_size=3, + padding=1): + """3x3 convolution with padding.""" return [ ( - f"{module_name}_{postfix}/conv", + f'{module_name}_{postfix}/conv', nn.Conv2d( in_channels, out_channels, @@ -128,16 +141,23 @@ def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, bias=False, ), ), - (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), - (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)), + (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)), ] -def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): - """1x1 convolution with padding""" +def conv1x1(in_channels, + out_channels, + module_name, + postfix, + stride=1, + groups=1, + kernel_size=1, + padding=0): + """1x1 convolution with padding.""" return [ ( - f"{module_name}_{postfix}/conv", + f'{module_name}_{postfix}/conv', nn.Conv2d( in_channels, out_channels, @@ -148,12 +168,13 @@ def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, bias=False, ), ), - (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), - (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)), + (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)), ] class Hsigmoid(nn.Module): + def __init__(self, inplace=True): super(Hsigmoid, self).__init__() self.inplace = inplace @@ -163,6 +184,7 @@ def forward(self, x): class eSEModule(nn.Module): + def __init__(self, channel, reduction=4): super(eSEModule, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) @@ -178,9 +200,16 @@ def forward(self, x): class _OSA_module(nn.Module): - def __init__( - self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False - ): + + def __init__(self, + in_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE=False, + identity=False, + depthwise=False): super(_OSA_module, self).__init__() @@ -192,18 +221,26 @@ def __init__( if self.depthwise and in_channel != stage_ch: self.isReduced = True self.conv_reduction = nn.Sequential( - OrderedDict(conv1x1(in_channel, stage_ch, 
"{}_reduction".format(module_name), "0")) - ) + OrderedDict( + conv1x1(in_channel, stage_ch, + '{}_reduction'.format(module_name), '0'))) for i in range(layer_per_block): if self.depthwise: - self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i)))) + self.layers.append( + nn.Sequential( + OrderedDict( + dw_conv3x3(stage_ch, stage_ch, module_name, i)))) else: - self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) + self.layers.append( + nn.Sequential( + OrderedDict( + conv3x3(in_channel, stage_ch, module_name, i)))) in_channel = stage_ch # feature aggregation in_channel = in_ch + layer_per_block * stage_ch - self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) + self.concat = nn.Sequential( + OrderedDict(conv1x1(in_channel, concat_ch, module_name, 'concat'))) self.ese = eSEModule(concat_ch) @@ -231,44 +268,65 @@ def forward(self, x): class _OSA_stage(nn.Sequential): - def __init__( - self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False - ): + + def __init__(self, + in_ch, + stage_ch, + concat_ch, + block_per_stage, + layer_per_block, + stage_num, + SE=False, + depthwise=False): super(_OSA_stage, self).__init__() if not stage_num == 2: - self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) + self.add_module( + 'Pooling', nn.MaxPool2d(kernel_size=3, + stride=2, + ceil_mode=True)) if block_per_stage != 1: SE = False - module_name = f"OSA{stage_num}_1" + module_name = f'OSA{stage_num}_1' self.add_module( - module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) - ) + module_name, + _OSA_module(in_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE, + depthwise=depthwise)) for i in range(block_per_stage - 1): if i != block_per_stage - 2: # last block SE = False - module_name = f"OSA{stage_num}_{i + 2}" + module_name = f'OSA{stage_num}_{i + 2}' self.add_module( module_name, - _OSA_module( - concat_ch, - stage_ch, - concat_ch, - layer_per_block, - module_name, - SE, - identity=True, - depthwise=depthwise - ), + _OSA_module(concat_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE, + identity=True, + depthwise=depthwise), ) -@BACKBONES.register_module() +@MODELS.register_module() class VoVNet(BaseModule): - def __init__(self, spec_name, input_ch=3, out_features=None, - frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): + + def __init__(self, + spec_name, + input_ch=3, + out_features=None, + frozen_stages=-1, + norm_eval=True, + pretrained=None, + init_cfg=None): """ Args: input_ch(int) : the number of input channel @@ -285,32 +343,35 @@ def __init__(self, spec_name, input_ch=3, out_features=None, self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) stage_specs = _STAGE_SPECS[spec_name] - stem_ch = stage_specs["stem"] - config_stage_ch = stage_specs["stage_conv_ch"] - config_concat_ch = stage_specs["stage_out_ch"] - block_per_stage = stage_specs["block_per_stage"] - layer_per_block = stage_specs["layer_per_block"] - SE = stage_specs["eSE"] - depthwise = stage_specs["dw"] + stem_ch = stage_specs['stem'] + config_stage_ch = stage_specs['stage_conv_ch'] + config_concat_ch = stage_specs['stage_out_ch'] + block_per_stage = stage_specs['block_per_stage'] + layer_per_block = stage_specs['layer_per_block'] + SE = stage_specs['eSE'] + depthwise = stage_specs['dw'] self._out_features = out_features # 
Stem module conv_type = dw_conv3x3 if depthwise else conv3x3 - stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) - stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) - stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) - self.add_module("stem", nn.Sequential((OrderedDict(stem)))) + stem = conv3x3(input_ch, stem_ch[0], 'stem', '1', 2) + stem += conv_type(stem_ch[0], stem_ch[1], 'stem', '2', 1) + stem += conv_type(stem_ch[1], stem_ch[2], 'stem', '3', 2) + self.add_module('stem', nn.Sequential((OrderedDict(stem)))) current_stirde = 4 - self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} - self._out_feature_channels = {"stem": stem_ch[2]} + self._out_feature_strides = { + 'stem': current_stirde, + 'stage2': current_stirde + } + self._out_feature_channels = {'stem': stem_ch[2]} stem_out_ch = [stem_ch[2]] in_ch_list = stem_out_ch + config_concat_ch[:-1] # OSA stages self.stage_names = [] for i in range(4): # num_stages - name = "stage%d" % (i + 2) # stage 2 ... stage 5 + name = 'stage%d' % (i + 2) # stage 2 ... stage 5 self.stage_names.append(name) self.add_module( name, @@ -328,7 +389,8 @@ def __init__(self, spec_name, input_ch=3, out_features=None, self._out_feature_channels[name] = config_concat_ch[i] if not i == 0: - self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) + self._out_feature_strides[name] = current_stirde = int( + current_stirde * 2) # initialize weights # self._initialize_weights() @@ -341,8 +403,8 @@ def _initialize_weights(self): def forward(self, x): outputs = {} x = self.stem(x) - if "stem" in self._out_features: - outputs["stem"] = x + if 'stem' in self._out_features: + outputs['stem'] = x for name in self.stage_names: x = getattr(self, name)(x) if name in self._out_features: @@ -372,4 +434,4 @@ def train(self, mode=True): for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): - m.eval() \ No newline at end of file + m.eval() diff --git a/projects/mmdet3d_plugin/models/dense_heads/__init__.py b/projects/mmdet3d_plugin/models/dense_heads/__init__.py index 9284477..ee90407 100755 --- a/projects/mmdet3d_plugin/models/dense_heads/__init__.py +++ b/projects/mmdet3d_plugin/models/dense_heads/__init__.py @@ -1,4 +1 @@ -from .dgcnn3d_head import DGCNN3DHead from .detr3d_head import Detr3DHead - -__all__ = ['DGCNN3DHead', 'Detr3DHead'] \ No newline at end of file diff --git a/projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py b/projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py old mode 100755 new mode 100644 index 39803a1..a5eca03 --- a/projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py +++ b/projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py @@ -1,22 +1,33 @@ import copy +from typing import Dict, List, Optional, Sequence, Tuple import torch import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn import Linear, bias_init_with_prob -from mmcv.runner import force_fp32 - -from mmdet.core import (multi_apply, multi_apply, reduce_mean) -from mmdet.models.utils.transformer import inverse_sigmoid -from mmdet.models import HEADS +from mmcv.cnn import Linear +from mmdet3d.models.task_modules.builder import \ + build_bbox_coder # need to change +from mmdet3d.registry import MODELS from mmdet.models.dense_heads import DETRHead -from mmdet3d.core.bbox.coders import build_bbox_coder -from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from mmdet.models.layers import inverse_sigmoid +from mmdet.models.utils import multi_apply 
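+# dev-1.x builds assigner/sampler/coder components through mmdet's TASK_UTILS registry (see the PseudoSampler construction in __init__ below). +# The 10-dim box code this head regresses is documented in the class docstring as (cx, cy, l, w, cz, h, sin(φ), cos(φ), v_x, v_y); normalize_bbox (imported below) converts gt boxes into that code when computing losses.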
+from mmdet.registry import TASK_UTILS +from mmdet.structures import SampleList +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.model import bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor +from projects.mmdet3d_plugin.models.task_modules.util import normalize_bbox -@HEADS.register_module() +# from projects.mmdet3d_plugin.models.utils.old_env import force_fp32 + + +@MODELS.register_module() class Detr3DHead(DETRHead): - """Head of Detr3D. + """Head of Detr3D. + Args: with_box_refine (bool): Whether to refine the reference points in the decoder. Defaults to False. @@ -24,37 +35,45 @@ class Detr3DHead(DETRHead): the outputs of encoder. transformer (obj:`ConfigDict`): ConfigDict is used for building the Encoder and Decoder. + bbox_coder (obj:`ConfigDict`): Config to build the bbox coder. + num_cls_fcs (int) : the number of layers in the cls and reg branches + code_weights (List[float]) : loss weights of (cx, cy, l, w, cz, h, sin(φ), cos(φ), v_x, v_y) + code_size (int) : length of the box code (matches code_weights) """ - def __init__(self, - *args, - with_box_refine=False, - as_two_stage=False, - transformer=None, - bbox_coder=None, - num_cls_fcs=2, - code_weights=None, - **kwargs): + + def __init__( + self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + code_size=10, + **kwargs): self.with_box_refine = with_box_refine self.as_two_stage = as_two_stage if self.as_two_stage: transformer['as_two_stage'] = self.as_two_stage - if 'code_size' in kwargs: - self.code_size = kwargs['code_size'] - else: - self.code_size = 10 - if code_weights is not None: - self.code_weights = code_weights - else: - self.code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] - + self.code_size = code_size + self.code_weights = code_weights + self.bbox_coder = build_bbox_coder(bbox_coder) self.pc_range = self.bbox_coder.pc_range self.num_cls_fcs = num_cls_fcs - 1 - super(Detr3DHead, self).__init__( - *args, transformer=transformer, **kwargs) - self.code_weights = nn.Parameter(torch.tensor( - self.code_weights, requires_grad=False), requires_grad=False) - + super(Detr3DHead, self).__init__(*args, + transformer=transformer, + **kwargs) + # DETR-style heads do not sample (sampling=False), so use PseudoSampler to format the assignment results + sampler_cfg = dict(type='PseudoSampler') + self.sampler = TASK_UTILS.build(sampler_cfg) + + self.code_weights = nn.Parameter(torch.tensor(self.code_weights, + requires_grad=False), + requires_grad=False) + + # Note: the 0.x forward_train() entry point becomes loss() in dev-1.x def _init_layers(self): """Initialize classification branch and regression branch of head.""" cls_branch = [] @@ -101,10 +120,12 @@ def init_weights(self): for m in self.cls_branches: nn.init.constant_(m[-1].bias, bias_init) - def forward(self, mlvl_feats, img_metas): + def forward(self, mlvl_feats: List[Tensor], img_metas: List[Dict], + **kwargs) -> Dict[str, Tensor]: """Forward function. + Args: - mlvl_feats (tuple[Tensor]): Features from the upstream + mlvl_feats (List[Tensor]): Features from the upstream network, each is a 5D-tensor with shape (B, N, C, H, W). Returns: @@ -112,18 +133,16 @@ def forward(self, mlvl_feats, img_metas): shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background.
-    def _get_target_single(self,
-                           cls_score,
-                           bbox_pred,
-                           gt_labels,
-                           gt_bboxes,
-                           gt_bboxes_ignore=None):
-        """"Compute regression and classification targets for one image.
-        Outputs from a single decoder layer of a single feature level are used.
-        Args:
-            cls_score (Tensor): Box score logits from a single decoder layer
-                for one image. Shape [num_query, cls_out_channels].
-            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
-                for one image, with normalized coordinate (cx, cy, w, h) and
-                shape [num_query, 4].
-            gt_bboxes (Tensor): Ground truth bboxes for one image with
-                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
-            gt_labels (Tensor): Ground truth class indices for one image
-                with shape (num_gts, ).
-            gt_bboxes_ignore (Tensor, optional): Bounding boxes
-                which can be ignored. Default None.
-        Returns:
-            tuple[Tensor]: a tuple containing the following for one image.
-                - labels (Tensor): Labels of each image.
-                - label_weights (Tensor]): Label weights of each image.
-                - bbox_targets (Tensor): BBox targets of each image.
-                - bbox_weights (Tensor): BBox weights of each image.
-                - pos_inds (Tensor): Sampled positive indices for each image.
-                - neg_inds (Tensor): Sampled negative indices for each image.
-        """
-
-        num_bboxes = bbox_pred.size(0)
-        # assigner and sampler
-        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
-                                             gt_labels, gt_bboxes_ignore)
-        sampling_result = self.sampler.sample(assign_result, bbox_pred,
-                                              gt_bboxes)
+    def _get_target_single(
+            self,
+            cls_score: Tensor,  # [query, num_cls]
+            bbox_pred: Tensor,  # [query, 10]
+            gt_instances_3d: InstanceData) -> Tuple[Tensor, ...]:
+        """Compute regression and classification targets for a single
+        image."""
+        # turn bottom center into gravity center
+        gt_bboxes = gt_instances_3d.bboxes_3d  # [num_gt, 9]
+        gt_bboxes = torch.cat(
+            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1)
+
+        gt_labels = gt_instances_3d.labels_3d  # [num_gt, num_cls]
+        # assigner and sampler: PseudoSampler
+        assign_result = self.assigner.assign(bbox_pred,
+                                             cls_score,
+                                             gt_bboxes,
+                                             gt_labels,
+                                             gt_bboxes_ignore=None)
+        sampling_result = self.sampler.sample(
+            assign_result, InstanceData(priors=bbox_pred),
+            InstanceData(bboxes_3d=gt_bboxes))
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        # label targets
+        num_bboxes = bbox_pred.size(0)
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
@@ -209,36 +220,38 @@
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
-        bbox_targets = torch.zeros_like(bbox_pred)[..., :9]
+        # theta in the gt box here is still a single scalar
+        bbox_targets = torch.zeros_like(bbox_pred)[..., :self.code_size - 1]
        bbox_weights = torch.zeros_like(bbox_pred)
+        # only matched queries learn from box coordinates
        bbox_weights[pos_inds] = 1.0

-        # DETR
+        # fix empty-gt bug in multi-GPU training
+        if sampling_result.pos_gt_bboxes.shape[0] == 0:
+            sampling_result.pos_gt_bboxes = \
+                sampling_result.pos_gt_bboxes.reshape(0, self.code_size - 1)
+
        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
-        return (labels, label_weights, bbox_targets, bbox_weights,
-                pos_inds, neg_inds)
-
-    def get_targets(self,
-                    cls_scores_list,
-                    bbox_preds_list,
-                    gt_bboxes_list,
-                    gt_labels_list,
-                    gt_bboxes_ignore_list=None):
-        """"Compute regression and classification targets for a batch image.
-        Outputs from a single decoder layer of a single feature level are used.
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
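The target preparation above swaps the box origin: `bboxes_3d.tensor` stores the bottom center, while matching uses the gravity center. A minimal sketch of that conversion, assuming a LiDAR box layout of (x, y, z_bottom, l, w, h, yaw):

    import torch

    # one toy box: bottom center (1, 2, 0), l=4, w=2, h=1.5, yaw=0
    box = torch.tensor([[1.0, 2.0, 0.0, 4.0, 2.0, 1.5, 0.0]])
    gravity_center = box[:, :3].clone()
    gravity_center[:, 2] += box[:, 5] * 0.5  # lift z by half the height
    gt = torch.cat((gravity_center, box[:, 3:]), dim=1)
    # gt -> tensor([[1.00, 2.00, 0.75, 4.00, 2.00, 1.50, 0.00]])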
+    def get_targets(
+            self,
+            batch_cls_scores: List[Tensor],  # bs[num_q,num_cls]
+            batch_bbox_preds: List[Tensor],  # bs[num_q,10]
+            batch_gt_instances_3d: InstanceList) -> tuple:
+        """Compute regression and classification targets for a batch of
+        images, for a single decoder layer.
+
        Args:
-            cls_scores_list (list[Tensor]): Box score logits from a single
+            batch_cls_scores (list[Tensor]): Box score logits from a single
                decoder layer for each image with shape [num_query,
                cls_out_channels].
-            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+            batch_bbox_preds (list[Tensor]): Sigmoid outputs from a single
                decoder layer for each image, with normalized coordinate
-                (cx, cy, w, h) and shape [num_query, 4].
-            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
-                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
-            gt_labels_list (list[Tensor]): Ground truth class indices for each
-                image with shape (num_gts, ).
-            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
-                boxes which can be ignored for each image. Default None.
+                (cx, cy, l, w, cz, h, sin(φ), cos(φ), v_x, v_y) and shape
+                [num_query, 10].
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d`` and
+                ``labels_3d``.
        Returns:
            tuple: a tuple containing the following targets.
            - labels_list (list[Tensor]): Labels for all images.
            - label_weights_list (list[Tensor]): Label weights for all \
                images.
            - bbox_targets_list (list[Tensor]): BBox targets for all \
                images.
            - bbox_weights_list (list[Tensor]): BBox weights for all \
                images.
            - num_total_pos (int): Number of positive samples in all \
                images.
            - num_total_neg (int): Number of negative samples in all \
                images.
        """
-        assert gt_bboxes_ignore_list is None, \
-            'Only supports for gt_bboxes_ignore setting to None.'
-        num_imgs = len(cls_scores_list)
-        gt_bboxes_ignore_list = [
-            gt_bboxes_ignore_list for _ in range(num_imgs)
-        ]
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         pos_inds_list, neg_inds_list) = multi_apply(self._get_target_single,
+                                                     batch_cls_scores,
+                                                     batch_bbox_preds,
+                                                     batch_gt_instances_3d)

-        (labels_list, label_weights_list, bbox_targets_list,
-         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
-             self._get_target_single, cls_scores_list, bbox_preds_list,
-             gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)
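`multi_apply` above is the mmdet helper that maps a per-image function over the batch lists and regroups the per-image tuples into one tuple of lists. A self-contained equivalent sketch (not the library code itself):

    from functools import partial

    def multi_apply_sketch(func, *args, **kwargs):
        # apply func to each per-image slice of the argument lists ...
        pfunc = partial(func, **kwargs) if kwargs else func
        map_results = map(pfunc, *args)
        # ... then transpose: one list per return value of func
        return tuple(map(list, zip(*map_results)))

    # e.g. if func returns (labels, weights) per image, the call returns
    # ([labels_img0, labels_img1, ...], [weights_img0, weights_img1, ...])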
-    def loss_single(self,
-                    cls_scores,
-                    bbox_preds,
-                    gt_bboxes_list,
-                    gt_labels_list,
-                    gt_bboxes_ignore_list=None):
+    def loss_by_feat_single(
+            self,
+            batch_cls_scores: Tensor,  # bs,num_q,num_cls
+            batch_bbox_preds: Tensor,  # bs,num_q,10
+            batch_gt_instances_3d: InstanceList) -> Tuple[Tensor, Tensor]:
        """Loss function for outputs from a single decoder layer of a single
        feature level.
+
        Args:
-            cls_scores (Tensor): Box score logits from a single decoder layer
-                for all images. Shape [bs, num_query, cls_out_channels].
-            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
-                for all images, with normalized coordinate (cx, cy, w, h) and
-                shape [bs, num_query, 4].
-            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
-                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
-            gt_labels_list (list[Tensor]): Ground truth class indices for each
-                image with shape (num_gts, ).
-            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
-                boxes which can be ignored for each image. Default None.
+            batch_cls_scores (Tensor): Box score logits from a single
+                decoder layer for batched images with shape [bs, num_query,
+                cls_out_channels].
+            batch_bbox_preds (Tensor): Sigmoid outputs from a single
+                decoder layer for batched images, with normalized coordinate
+                (cx, cy, l, w, cz, h, sin(φ), cos(φ), v_x, v_y) and shape
+                [bs, num_query, 10].
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d`` and
+                ``labels_3d``.
        Returns:
-            dict[str, Tensor]: A dictionary of loss components for outputs from
+            tuple(Tensor, Tensor): cls and reg loss for outputs from
                a single decoder layer.
""" - num_imgs = cls_scores.size(0) - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + batch_size = batch_cls_scores.size(0) # batch size + cls_scores_list = [batch_cls_scores[i] for i in range(batch_size)] + bbox_preds_list = [batch_bbox_preds[i] for i in range(batch_size)] cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, - gt_bboxes_list, gt_labels_list, - gt_bboxes_ignore_list) + batch_gt_instances_3d) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets labels = torch.cat(labels_list, 0) @@ -307,105 +312,87 @@ def loss_single(self, bbox_weights = torch.cat(bbox_weights_list, 0) # classification loss - cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + batch_cls_scores = batch_cls_scores.reshape(-1, self.cls_out_channels) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( - cls_scores.new_tensor([cls_avg_factor])) + batch_cls_scores.new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + loss_cls = self.loss_cls(batch_cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) - # Compute the average number of gt boxes accross all gpus, for + # Compute the average number of gt boxes across all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() # regression L1 loss - bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + batch_bbox_preds = batch_bbox_preds.reshape(-1, + batch_bbox_preds.size(-1)) normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + # neg_query is all 0, log(0) is NaN isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) bbox_weights = bbox_weights * self.code_weights loss_bbox = self.loss_bbox( - bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=num_total_pos) + batch_bbox_preds[isnotnan, :self.code_size], + normalized_bbox_targets[isnotnan, :self.code_size], + bbox_weights[isnotnan, :self.code_size], + avg_factor=num_total_pos) loss_cls = torch.nan_to_num(loss_cls) loss_bbox = torch.nan_to_num(loss_bbox) return loss_cls, loss_bbox - - @force_fp32(apply_to=('preds_dicts')) - def loss(self, - gt_bboxes_list, - gt_labels_list, - preds_dicts, - gt_bboxes_ignore=None): - """"Loss function. + + # original loss() + # @force_fp32(apply_to=('preds_dicts')) + def loss_by_feat( + self, + batch_gt_instances_3d: InstanceList, + preds_dicts: Dict[str, Tensor], + batch_gt_instances_3d_ignore: OptInstanceList = None) -> Dict: + """Compute loss of the head. + Args: - - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - preds_dicts: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_bbox_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (cx, cy, w, h) and shape - [nb_dec, bs, num_query, 4]. 
-                enc_cls_scores (Tensor): Classification scores of
-                    points on encode feature map , has shape
-                    (N, h*w, num_classes). Only be passed when as_two_stage is
-                    True, otherwise is None.
-                enc_bbox_preds (Tensor): Regression results of each points
-                    on the encode feature map, has shape (N, h*w, 4). Only be
-                    passed when as_two_stage is True, otherwise is None.
-            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
-                which can be ignored for each image. Default None.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d``,
+                ``labels_3d``, ``depths``, ``centers_2d`` and attributes.
+            preds_dicts (dict[str, Tensor]): Outputs of self.forward().
+            batch_gt_instances_3d_ignore (list[:obj:`InstanceData`], Optional):
+                NOT supported. Defaults to None.
+
        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
-        assert gt_bboxes_ignore is None, \
+        assert batch_gt_instances_3d_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
-            f'for gt_bboxes_ignore setting to None.'
-
-        all_cls_scores = preds_dicts['all_cls_scores']
-        all_bbox_preds = preds_dicts['all_bbox_preds']
+            f'for batch_gt_instances_3d_ignore setting to None.'
+        all_cls_scores = preds_dicts[
+            'all_cls_scores']  # num_dec,bs,num_q,num_cls
+        all_bbox_preds = preds_dicts['all_bbox_preds']  # num_dec,bs,num_q,10
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']

+        # calculate loss for each decoder layer
        num_dec_layers = len(all_cls_scores)
-        device = gt_labels_list[0].device
-        gt_bboxes_list = [torch.cat(
-            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
-            dim=1).to(device) for gt_bboxes in gt_bboxes_list]
-
-        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
-        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
-        all_gt_bboxes_ignore_list = [
-            gt_bboxes_ignore for _ in range(num_dec_layers)
+        batch_gt_instances_3d_list = [
+            batch_gt_instances_3d for _ in range(num_dec_layers)
        ]
-
-        losses_cls, losses_bbox = multi_apply(
-            self.loss_single, all_cls_scores, all_bbox_preds,
-            all_gt_bboxes_list, all_gt_labels_list,
-            all_gt_bboxes_ignore_list)
+        losses_cls, losses_bbox = multi_apply(self.loss_by_feat_single,
+                                              all_cls_scores, all_bbox_preds,
+                                              batch_gt_instances_3d_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
-            binary_labels_list = [
-                torch.zeros_like(gt_labels_list[i])
-                for i in range(len(all_gt_labels_list))
-            ]
            enc_loss_cls, enc_losses_bbox = \
-                self.loss_single(enc_cls_scores, enc_bbox_preds,
-                                 gt_bboxes_list, binary_labels_list, gt_bboxes_ignore)
+                self.loss_by_feat_single(enc_cls_scores, enc_bbox_preds,
+                                         batch_gt_instances_3d)
+            # note: pass the per-image gt list here, not the per-layer
+            # replication built above
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox

@@ -415,31 +402,56 @@
        # loss from other decoder layers
        num_dec_layer = 0
-        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1],
-                                           losses_bbox[:-1]):
+        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            num_dec_layer += 1
        return loss_dict
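With the default 6-layer decoder, the resulting dict carries the last layer's losses under plain keys and earlier layers under `d{i}.` prefixes; a toy illustration with dummy scalars:

    losses_cls = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4]    # dummy per-layer values
    losses_bbox = [1.9, 1.8, 1.7, 1.6, 1.5, 1.4]
    loss_dict = dict(loss_cls=losses_cls[-1], loss_bbox=losses_bbox[-1])
    for i, (lc, lb) in enumerate(zip(losses_cls[:-1], losses_bbox[:-1])):
        loss_dict[f'd{i}.loss_cls'] = lc
        loss_dict[f'd{i}.loss_bbox'] = lb
    # keys: loss_cls, loss_bbox, d0.loss_cls, ..., d4.loss_bbox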
-    @force_fp32(apply_to=('preds_dicts'))
-    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
-        """Generate bboxes from bbox head predictions.
+    # @force_fp32(apply_to=('preds_dicts'))
+    def predict_by_feat(self,
+                        preds_dicts,
+                        img_metas,
+                        rescale=False) -> InstanceList:
+        """Transform network output for a batch into bbox predictions.
+
        Args:
-            preds_dicts (tuple[list[dict]]): Prediction results.
-            img_metas (list[dict]): Point cloud and image's meta info.
+            preds_dicts (Dict[str, Tensor]):
+                - all_cls_scores (Tensor): Outputs from the classification
+                  head, shape [nb_dec, bs, num_query, cls_out_channels].
+                  Note cls_out_channels should include background.
+                - all_bbox_preds (Tensor): Sigmoid outputs from the
+                  regression head with normalized coordinate format
+                  (cx, cy, l, w, cz, h, rot_sine, rot_cosine, v_x, v_y).
+                  Shape [nb_dec, bs, num_query, 10].
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
        Returns:
-            list[dict]: Decoded bbox, scores and labels after nms.
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C), where C >= 7.
        """
+        # decode sin(θ) & cos(θ) back to the yaw angle θ
        preds_dicts = self.bbox_coder.decode(preds_dicts)
        num_samples = len(preds_dicts)  # batch size
        ret_list = []
        for i in range(num_samples):
+            results = InstanceData()
            preds = preds_dicts[i]
            bboxes = preds['bboxes']
+            # move the box center from gravity center back to bottom center
            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
-            bboxes = img_metas[i]['box_type_3d'](bboxes, 9)
-            scores = preds['scores']
-            labels = preds['labels']
-            ret_list.append([bboxes, scores, labels])
+            bboxes = img_metas[i]['box_type_3d'](bboxes, self.code_size - 1)
+
+            results.bboxes_3d = bboxes
+            results.scores_3d = preds['scores']
+            results.labels_3d = preds['labels']
+            ret_list.append(results)
        return ret_list
diff --git a/projects/mmdet3d_plugin/models/dense_heads/dgcnn3d_head.py b/projects/mmdet3d_plugin/models/dense_heads/dgcnn3d_head.py
deleted file mode 100755
index 1f6e7e9..0000000
--- a/projects/mmdet3d_plugin/models/dense_heads/dgcnn3d_head.py
+++ /dev/null
@@ -1,484 +0,0 @@
-import copy
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from mmcv.cnn import Linear, bias_init_with_prob
-from mmcv.runner import force_fp32, auto_fp16
-
-from mmdet.core import multi_apply, reduce_mean
-from mmdet.models.utils.transformer import inverse_sigmoid
-from mmdet.models import HEADS
-from mmdet.models.dense_heads import DETRHead
-from mmdet3d.core.bbox.coders import build_bbox_coder
-from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox
-
-
-@HEADS.register_module()
-class DGCNN3DHead(DETRHead):
-    """Head of DeformDETR3D.
-    Args:
-        with_box_refine (bool): Whether to refine the reference points
-            in the decoder. Defaults to False.
-        as_two_stage (bool) : Whether to generate the proposal from
-            the outputs of encoder.
-        transformer (obj:`ConfigDict`): ConfigDict is used for building
-            the Encoder and Decoder.
- """ - def __init__(self, - *args, - with_box_refine=False, - as_two_stage=False, - transformer=None, - bbox_coder=None, - num_cls_fcs=2, - **kwargs): - self.with_box_refine = with_box_refine - self.as_two_stage = as_two_stage - if self.as_two_stage: - transformer['as_two_stage'] = self.as_two_stage - if 'code_size' in kwargs: - self.code_size = kwargs['code_size'] - else: - self.code_size = 10 - self.bbox_coder = build_bbox_coder(bbox_coder) - self.pc_range = self.bbox_coder.pc_range - self.voxel_size = self.bbox_coder.voxel_size - - self.bev_shape = (int((self.pc_range[3] - self.pc_range[0]) / self.voxel_size[0]), - int((self.pc_range[4] - self.pc_range[1]) / self.voxel_size[1])) - - self.num_cls_fcs = num_cls_fcs - 1 - super(DGCNN3DHead, self).__init__( - *args, transformer=transformer, **kwargs) - - - def _init_layers(self): - """Initialize classification branch and regression branch of head.""" - cls_branch = [] - for _ in range(self.num_reg_fcs): - cls_branch.append(Linear(self.embed_dims, self.embed_dims)) - cls_branch.append(nn.LayerNorm(self.embed_dims)) - cls_branch.append(nn.ReLU(inplace=True)) - cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) - fc_cls = nn.Sequential(*cls_branch) - - reg_branch = [] - for _ in range(self.num_reg_fcs): - reg_branch.append(Linear(self.embed_dims, self.embed_dims)) - # reg_branch.append(nn.LayerNorm(self.embed_dims)) - reg_branch.append(nn.ReLU()) - reg_branch.append(Linear(self.embed_dims, self.code_size)) - reg_branch = nn.Sequential(*reg_branch) - - def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) - - # last reg_branch is used to generate proposal from - # encode feature map when as_two_stage is True. - num_pred = (self.transformer.decoder.num_layers + 1) if \ - self.as_two_stage else self.transformer.decoder.num_layers - - if self.with_box_refine: - self.cls_branches = _get_clones(fc_cls, num_pred) - self.reg_branches = _get_clones(reg_branch, num_pred) - else: - self.cls_branches = nn.ModuleList( - [fc_cls for _ in range(num_pred)]) - self.reg_branches = nn.ModuleList( - [reg_branch for _ in range(num_pred)]) - - if not self.as_two_stage: - self.query_embedding = nn.Embedding(self.num_query, - self.embed_dims * 2) - - def init_weights(self): - """Initialize weights of the head.""" - self.transformer.init_weights() - if self.loss_cls.use_sigmoid: - bias_init = bias_init_with_prob(0.01) - for m in self.cls_branches: - nn.init.constant_(m[-1].bias, bias_init) - if self.as_two_stage: - for m in self.reg_branches: - nn.init.constant_(m[-1].bias.data[2:], 0.0) - - def forward(self, mlvl_feats): - """Forward function. - Args: - mlvl_feats (tuple[Tensor]): Features from the upstream - network, each is a 4D-tensor with shape - (N, C, H, W). - Returns: - all_cls_scores (Tensor): Outputs from the classification head, \ - shape [nb_dec, bs, num_query, cls_out_channels]. Note \ - cls_out_channels should includes background. - all_bbox_preds (Tensor): Sigmoid outputs from the regression \ - head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ - Shape [nb_dec, bs, num_query, 9]. - enc_outputs_class (Tensor): The score of each point on encode \ - feature map, has shape (N, h*w, num_class). Only when \ - as_two_stage is Ture it would be returned, otherwise \ - `None` would be returned. - enc_outputs_coord (Tensor): The proposal generate from the \ - encode feature map, has shape (N, h*w, 4). 
Only when \ - as_two_stage is Ture it would be returned, otherwise \ - `None` would be returned. - """ - - batch_size = mlvl_feats[0].size(0) - input_img_h, input_img_w = self.bev_shape - img_masks = mlvl_feats[0].new_zeros( - (batch_size, input_img_h, input_img_w)) - - mlvl_masks = [] - mlvl_positional_encodings = [] - for feat in mlvl_feats: - mlvl_masks.append( - F.interpolate(img_masks[None], - size=feat.shape[-2:]).to(torch.bool).squeeze(0)) - mlvl_positional_encodings.append( - self.positional_encoding(mlvl_masks[-1])) - - query_embeds = None - if not self.as_two_stage: - query_embeds = self.query_embedding.weight - hs, init_reference, inter_references, \ - enc_outputs_class, enc_outputs_coord = self.transformer( - mlvl_feats, - mlvl_masks, - query_embeds, - mlvl_positional_encodings, - reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 - cls_branches=self.cls_branches if self.as_two_stage else None # noqa:E501 - ) - hs = hs.permute(0, 2, 1, 3) - outputs_classes = [] - outputs_coords = [] - - for lvl in range(hs.shape[0]): - if lvl == 0: - reference = init_reference - else: - reference = inter_references[lvl - 1] - reference = inverse_sigmoid(reference) - outputs_class = self.cls_branches[lvl](hs[lvl]) - tmp = self.reg_branches[lvl](hs[lvl]) - - if reference.shape[-1] == 4: - tmp += reference - else: - assert reference.shape[-1] == 2 - tmp[..., 0:2] += reference - tmp[..., 0:2] = tmp[..., 0:2].sigmoid() - tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) - tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) - - if tmp.size(-1) > 8: - outputs_coord = torch.cat((tmp[..., :6], tmp[..., 6:8], tmp[..., 8:]), -1) - else: - outputs_coord = torch.cat((tmp[..., :6], tmp[..., 6:8]), -1) - outputs_classes.append(outputs_class) - outputs_coords.append(outputs_coord) - - outputs_classes = torch.stack(outputs_classes) - outputs_coords = torch.stack(outputs_coords) - if self.as_two_stage: - outs = { - 'all_cls_scores': outputs_classes, - 'all_bbox_preds': outputs_coords, - 'enc_cls_scores': enc_outputs_class, - 'enc_bbox_preds': enc_outputs_coord.sigmoid(), - } - else: - outs = { - 'all_cls_scores': outputs_classes, - 'all_bbox_preds': outputs_coords, - 'enc_cls_scores': None, - 'enc_bbox_preds': None, - } - return outs - - def _get_target_single(self, - cls_score, - bbox_pred, - gt_bboxes, - gt_labels, - gt_bboxes_ignore=None): - """"Compute regression and classification targets for one image. - Outputs from a single decoder layer of a single feature level are used. - Args: - cls_score (Tensor): Box score logits from a single decoder layer - for one image. Shape [num_query, cls_out_channels]. - bbox_pred (Tensor): Sigmoid outputs from a single decoder layer - for one image, with normalized coordinate (cx, cy, w, h) and - shape [num_query, 4]. - gt_bboxes (Tensor): Ground truth bboxes for one image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (Tensor): Ground truth class indices for one image - with shape (num_gts, ). - gt_bboxes_ignore (Tensor, optional): Bounding boxes - which can be ignored. Default None. - Returns: - tuple[Tensor]: a tuple containing the following for one image. - - labels (Tensor): Labels of each image. - - label_weights (Tensor]): Label weights of each image. - - bbox_targets (Tensor): BBox targets of each image. - - bbox_weights (Tensor): BBox weights of each image. - - pos_inds (Tensor): Sampled positive indices for each image. 
- - neg_inds (Tensor): Sampled negative indices for each image. - """ - - num_bboxes = bbox_pred.size(0) - # assigner and sampler - assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, - gt_labels, gt_bboxes_ignore) - sampling_result = self.sampler.sample(assign_result, bbox_pred, - gt_bboxes) - pos_inds = sampling_result.pos_inds - neg_inds = sampling_result.neg_inds - - # label targets - labels = gt_bboxes.new_full((num_bboxes, ), - self.num_classes, - dtype=torch.long) - labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] - label_weights = gt_bboxes.new_ones(num_bboxes) - - # bbox targets - bbox_targets = torch.zeros_like(bbox_pred)[..., :self.code_size-1] - bbox_weights = torch.zeros_like(bbox_pred) - bbox_weights[pos_inds] = 1.0 - - # DETR - bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes - return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, - neg_inds) - - def get_targets(self, - cls_scores_list, - bbox_preds_list, - gt_bboxes_list, - gt_labels_list, - gt_bboxes_ignore_list=None): - """"Compute regression and classification targets for a batch image. - Outputs from a single decoder layer of a single feature level are used. - Args: - cls_scores_list (list[Tensor]): Box score logits from a single - decoder layer for each image with shape [num_query, - cls_out_channels]. - bbox_preds_list (list[Tensor]): Sigmoid outputs from a single - decoder layer for each image, with normalized coordinate - (cx, cy, w, h) and shape [num_query, 4]. - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_bboxes_ignore_list (list[Tensor], optional): Bounding - boxes which can be ignored for each image. Default None. - Returns: - tuple: a tuple containing the following targets. - - labels_list (list[Tensor]): Labels for all images. - - label_weights_list (list[Tensor]): Label weights for all \ - images. - - bbox_targets_list (list[Tensor]): BBox targets for all \ - images. - - bbox_weights_list (list[Tensor]): BBox weights for all \ - images. - - num_total_pos (int): Number of positive samples in all \ - images. - - num_total_neg (int): Number of negative samples in all \ - images. - """ - assert gt_bboxes_ignore_list is None, \ - 'Only supports for gt_bboxes_ignore setting to None.' - num_imgs = len(cls_scores_list) - gt_bboxes_ignore_list = [ - gt_bboxes_ignore_list for _ in range(num_imgs) - ] - - (labels_list, label_weights_list, bbox_targets_list, - bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( - self._get_target_single, cls_scores_list, bbox_preds_list, - gt_bboxes_list, gt_labels_list, gt_bboxes_ignore_list) - num_total_pos = sum((inds.numel() for inds in pos_inds_list)) - num_total_neg = sum((inds.numel() for inds in neg_inds_list)) - return (labels_list, label_weights_list, bbox_targets_list, - bbox_weights_list, num_total_pos, num_total_neg) - - def loss_single(self, - cls_scores, - bbox_preds, - gt_bboxes_list, - gt_labels_list, - gt_bboxes_ignore_list=None): - """"Loss function for outputs from a single decoder layer of a single - feature level. - Args: - cls_scores (Tensor): Box score logits from a single decoder layer - for all images. Shape [bs, num_query, cls_out_channels]. - bbox_preds (Tensor): Sigmoid outputs from a single decoder layer - for all images, with normalized coordinate (cx, cy, w, h) and - shape [bs, num_query, 4]. 
- gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_bboxes_ignore_list (list[Tensor], optional): Bounding - boxes which can be ignored for each image. Default None. - Returns: - dict[str, Tensor]: A dictionary of loss components for outputs from - a single decoder layer. - """ - num_imgs = cls_scores.size(0) - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] - cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, - gt_bboxes_list, gt_labels_list, gt_bboxes_ignore_list) - (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, - num_total_pos, num_total_neg) = cls_reg_targets - labels = torch.cat(labels_list, 0) - label_weights = torch.cat(label_weights_list, 0) - bbox_targets = torch.cat(bbox_targets_list, 0) - bbox_weights = torch.cat(bbox_weights_list, 0) - - # classification loss - cls_scores = cls_scores.reshape(-1, self.cls_out_channels) - # construct weighted avg_factor to match with the official DETR repo - cls_avg_factor = num_total_pos * 1.0 + \ - num_total_neg * self.bg_cls_weight - if self.sync_cls_avg_factor: - cls_avg_factor = reduce_mean( - cls_scores.new_tensor([cls_avg_factor])) - - cls_avg_factor = max(cls_avg_factor, 1) - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) - - # Compute the average number of gt boxes accross all gpus, for - # normalization purposes - num_total_pos = loss_cls.new_tensor([num_total_pos]) - num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() - - # regression L1 loss - bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) - normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) - isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) - loss_bbox = self.loss_bbox( - bbox_preds[isnotnan, :8], normalized_bbox_targets[isnotnan, :8], bbox_weights[isnotnan, :8], avg_factor=num_total_pos) - if self.code_size > 8: - loss_bbox_vel = self.loss_bbox( - bbox_preds[isnotnan, 8:], normalized_bbox_targets[isnotnan, 8:], bbox_weights[isnotnan, 8:], avg_factor=num_total_pos) - loss_bbox = loss_bbox + loss_bbox_vel * 0.2 - - return loss_cls, loss_bbox - - @force_fp32(apply_to=('preds_dicts')) - def loss(self, - gt_bboxes_list, - gt_labels_list, - preds_dicts, - gt_bboxes_ignore=None): - """"Loss function. - Args: - - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - preds_dicts: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_bbox_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (cx, cy, w, h) and shape - [nb_dec, bs, num_query, 4]. - enc_cls_scores (Tensor): Classification scores of - points on encode feature map , has shape - (N, h*w, num_classes). Only be passed when as_two_stage is - True, otherwise is None. - enc_bbox_preds (Tensor): Regression results of each points - on the encode feature map, has shape (N, h*w, 4). Only be - passed when as_two_stage is True, otherwise is None. 
- gt_bboxes_ignore (list[Tensor], optional): Bounding boxes - which can be ignored for each image. Default None. - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert gt_bboxes_ignore is None, \ - f'{self.__class__.__name__} only supports ' \ - f'for gt_bboxes_ignore setting to None.' - - all_cls_scores = preds_dicts['all_cls_scores'] - all_bbox_preds = preds_dicts['all_bbox_preds'] - enc_cls_scores = preds_dicts['enc_cls_scores'] - enc_bbox_preds = preds_dicts['enc_bbox_preds'] - - num_dec_layers = len(all_cls_scores) - device = gt_labels_list[0].device - gt_bboxes_list = [torch.cat( - (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), - dim=1).to(device) for gt_bboxes in gt_bboxes_list] - - all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] - all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] - all_gt_bboxes_ignore_list = [ - gt_bboxes_ignore for _ in range(num_dec_layers) - ] - - losses_cls, losses_bbox = multi_apply( - self.loss_single, all_cls_scores, all_bbox_preds, - all_gt_bboxes_list, all_gt_labels_list, - all_gt_bboxes_ignore_list) - - loss_dict = dict() - # loss of proposal generated from encode feature map. - if enc_cls_scores is not None: - binary_labels_list = [ - torch.zeros_like(gt_labels_list[i]) - for i in range(len(all_gt_labels_list)) - ] - enc_loss_cls, enc_losses_bbox = \ - self.loss_single(enc_cls_scores, enc_bbox_preds, - gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) - loss_dict['enc_loss_cls'] = enc_loss_cls - loss_dict['enc_loss_bbox'] = enc_losses_bbox - - # loss from the last decoder layer - loss_dict['loss_cls'] = losses_cls[-1] - loss_dict['loss_bbox'] = losses_bbox[-1] - - # loss from other decoder layers - num_dec_layer = 0 - for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], - losses_bbox[:-1]): - loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i - loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i - num_dec_layer += 1 - return loss_dict - - @force_fp32(apply_to=('preds_dicts')) - def get_bboxes(self, preds_dicts, img_metas, rescale=False): - """Generate bboxes from bbox head predictions. - Args: - preds_dicts (tuple[list[dict]]): Prediction results. - img_metas (list[dict]): Point cloud and image's meta info. - Returns: - list[dict]: Decoded bbox, scores and labels after nms. 
-        """
-        preds_dicts = self.bbox_coder.decode(preds_dicts)
-        num_samples = len(preds_dicts)
-        ret_list = []
-        for i in range(num_samples):
-            preds = preds_dicts[i]
-            bboxes = preds['bboxes']
-            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
-            if bboxes.size(-1) == 9:
-                bboxes = img_metas[i]['box_type_3d'](bboxes, 9)
-            else:
-                bboxes = img_metas[i]['box_type_3d'](bboxes, 7)
-            scores = preds['scores']
-            labels = preds['labels']
-            ret_list.append([bboxes, scores, labels])
-        return ret_list
\ No newline at end of file
diff --git a/projects/mmdet3d_plugin/models/detectors/__init__.py b/projects/mmdet3d_plugin/models/detectors/__init__.py
index ec02344..9fb9fa3 100755
--- a/projects/mmdet3d_plugin/models/detectors/__init__.py
+++ b/projects/mmdet3d_plugin/models/detectors/__init__.py
@@ -1,4 +1,4 @@
-from .obj_dgcnn import ObjDGCNN
 from .detr3d import Detr3D
+from .detr3d_old import Detr3D_old

-__all__ = [ 'ObjDGCNN', 'Detr3D']
+# from .visualizer_zlt import *
diff --git a/projects/mmdet3d_plugin/models/detectors/detr3d.py b/projects/mmdet3d_plugin/models/detectors/detr3d.py
old mode 100755
new mode 100644
index 5c84fb1..63613b2
--- a/projects/mmdet3d_plugin/models/detectors/detr3d.py
+++ b/projects/mmdet3d_plugin/models/detectors/detr3d.py
@@ -1,48 +1,92 @@
-import torch
+import os
+import time
+from typing import Dict, List, Optional, Sequence

-from mmcv.runner import force_fp32, auto_fp16
-from mmdet.models import DETECTORS
-from mmdet3d.core import bbox3d2result
+import numpy as np
+import torch
 from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from torch import Tensor
+
 from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask

-@DETECTORS.register_module()
+# import torchvision.utils as vutils
+# from mmdet3d.structures.bbox_3d.utils import get_lidar2img
+# from .visualizer_zlt import *
+# import cv2
+@MODELS.register_module()
 class Detr3D(MVXTwoStageDetector):
-    """Detr3D."""
+    """DETR3D: 3D Object Detection from Multi-view Images via
+    3D-to-2D Queries.
+
+    Args:
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`Det3DDataPreprocessor`. Defaults to None.
+        use_grid_mask (bool): Data augmentation. Whether to mask out some
+            grids during extract_img_feat. Defaults to False.
+        img_backbone (dict, optional): Backbone for extracting
+            image features. Defaults to None.
+        img_neck (dict, optional): Neck for extracting
+            image features. Defaults to None.
+        pts_bbox_head (dict, optional): Bbox head of
+            DETR3D. Defaults to None.
+        train_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        test_cfg (dict, optional): Test config of model.
+            Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+    """
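Being registered in `MODELS`, the detector is normally built from a config dict rather than constructed directly. A minimal sketch, assuming the plugin package has been imported (the `custom_imports` entry in the configs does exactly that):

    from mmdet3d.registry import MODELS
    import projects.mmdet3d_plugin  # noqa: F401, triggers registration

    detr3d_cls = MODELS.get('Detr3D')  # resolves the class registered above
    # the runner instead calls MODELS.build(cfg.model) with the full model
    # dict from one of the configs shipped in this patch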
    def __init__(self,
+                 data_preprocessor=None,
                 use_grid_mask=False,
-                 pts_voxel_layer=None,
-                 pts_voxel_encoder=None,
-                 pts_middle_encoder=None,
-                 pts_fusion_layer=None,
                 img_backbone=None,
-                 pts_backbone=None,
                 img_neck=None,
-                 pts_neck=None,
                 pts_bbox_head=None,
-                 img_roi_head=None,
-                 img_rpn_head=None,
                 train_cfg=None,
                 test_cfg=None,
-                 pretrained=None):
-        super(Detr3D,
-              self).__init__(pts_voxel_layer, pts_voxel_encoder,
-                             pts_middle_encoder, pts_fusion_layer,
-                             img_backbone, pts_backbone, img_neck, pts_neck,
-                             pts_bbox_head, img_roi_head, img_rpn_head,
-                             train_cfg, test_cfg, pretrained)
-        self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
+                 pretrained=None,
+                 debug_name=None,
+                 gtvis_range=[0, 105],
+                 vis_count=None):
+        super(Detr3D, self).__init__(img_backbone=img_backbone,
+                                     img_neck=img_neck,
+                                     pts_bbox_head=pts_bbox_head,
+                                     train_cfg=train_cfg,
+                                     test_cfg=test_cfg,
+                                     data_preprocessor=data_preprocessor)
+        self.grid_mask = GridMask(True,
+                                  True,
+                                  rotate=1,
+                                  offset=False,
+                                  ratio=0.5,
+                                  mode=1,
+                                  prob=0.7)
        self.use_grid_mask = use_grid_mask
+        self.debug_name = debug_name
+        self.gtvis_range = gtvis_range
+        self.vis_count = vis_count
+
+    def extract_img_feat(self, img: Tensor,
+                         batch_input_metas: List[dict]) -> List[Tensor]:
+        """Extract features from images.
+
+        Args:
+            img (Tensor): Batched multi-view image tensor with shape
+                (B, N, C, H, W).
+            batch_input_metas (list[dict]): Meta information of multiple
+                inputs in a batch.
+
+        Returns:
+            list[Tensor]: Multi-level image features.
+        """
-    def extract_img_feat(self, img, img_metas):
-        """Extract features of images."""
        B = img.size(0)
        if img is not None:
-            input_shape = img.shape[-2:]
+            input_shape = img.shape[-2:]  # img: (B, N, C, H, W)
            # update real input shape of each single img
-            for img_meta in img_metas:
+            for img_meta in batch_input_metas:
                img_meta.update(input_shape=input_shape)

            if img.dim() == 5 and img.size(0) == 1:
@@ -51,7 +95,7 @@ def extract_img_feat(self, img, img_metas):
                B, N, C, H, W = img.size()
                img = img.view(B * N, C, H, W)
            if self.use_grid_mask:
-                img = self.grid_mask(img)
+                img = self.grid_mask(img)  # mask out some grids
            img_feats = self.img_backbone(img)
            if isinstance(img_feats, dict):
                img_feats = list(img_feats.values())
@@ -59,158 +103,142 @@
            return None
        if self.with_img_neck:
            img_feats = self.img_neck(img_feats)
+
        img_feats_reshaped = []
        for img_feat in img_feats:
            BN, C, H, W = img_feat.size()
            img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
        return img_feats_reshaped

-    @auto_fp16(apply_to=('img'), out_fp32=True)
-    def extract_feat(self, img, img_metas):
-        """Extract features from images and points."""
-        img_feats = self.extract_img_feat(img, img_metas)
+    def extract_feat(self, batch_inputs_dict: Dict,
+                     batch_input_metas: List[dict]) -> List[Tensor]:
+        """Extract features from images.
+
+        Refer to self.extract_img_feat().
+        """
+        imgs = batch_inputs_dict.get('imgs', None)
+        img_feats = self.extract_img_feat(imgs, batch_input_metas)
        return img_feats
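extract_img_feat folds the N camera views into the batch dimension for the backbone and unfolds them again for the head. The reshape round trip in isolation (illustrative only):

    import torch

    B, N, C, H, W = 2, 6, 3, 64, 64                # 2 samples, 6 cameras
    img = torch.rand(B, N, C, H, W)
    flat = img.view(B * N, C, H, W)                # views become batch items
    feat = torch.rand(B * N, 256, H // 4, W // 4)  # stand-in backbone output
    per_view = feat.view(B, N, 256, H // 4, W // 4)  # restore view dimension
    assert per_view.shape == (2, 6, 256, 16, 16)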
-    def forward_pts_train(self,
-                          pts_feats,
-                          gt_bboxes_3d,
-                          gt_labels_3d,
-                          img_metas,
-                          gt_bboxes_ignore=None):
-        """Forward function for point cloud branch.
+    def _forward(self):
+        raise NotImplementedError('tensor mode is not supported yet')
+
+    # original forward_train
+    def loss(self, batch_inputs_dict: Dict[str, Tensor],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        """
        Args:
-            pts_feats (list[torch.Tensor]): Features of point cloud branch
-            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
-                boxes for each sample.
-            gt_labels_3d (list[torch.Tensor]): Ground truth labels for
-                boxes of each sampole
-            img_metas (list[dict]): Meta information of samples.
-            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
-                boxes to be ignored. Defaults to None.
+            batch_inputs_dict (dict): The model input dict which includes
+                the `imgs` key.
+
+                - imgs (torch.Tensor): Tensor of batched multi-view images,
+                  has shape (B, N, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instances_3d`.
+
        Returns:
-            dict: Losses of each branch.
+            dict[str, Tensor]: A dictionary of loss components.
+        """
-        outs = self.pts_bbox_head(pts_feats, img_metas)
-        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
-        losses = self.pts_bbox_head.loss(*loss_inputs)
-        return losses
-
-    @force_fp32(apply_to=('img', 'points'))
-    def forward(self, return_loss=True, **kwargs):
-        """Calls either forward_train or forward_test depending on whether
-        return_loss=True.
-        Note this setting will change the expected inputs. When
-        `return_loss=True`, img and img_metas are single-nested (i.e.
-        torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
-        img_metas should be double nested (i.e. list[torch.Tensor],
-        list[list[dict]]), with the outer list indicating test time
-        augmentations.
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        batch_input_metas = self.add_lidar2img(batch_input_metas)
+        img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
+        outs = self.pts_bbox_head(img_feats, batch_input_metas, **kwargs)
+
+        batch_gt_instances_3d = [
+            item.gt_instances_3d for item in batch_data_samples
+        ]
+        loss_inputs = [batch_gt_instances_3d, outs]
+        losses_pts = self.pts_bbox_head.loss_by_feat(*loss_inputs)
+
+        return losses_pts
+
+    # original simple_test
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which includes
+                the `imgs` key.
+
+                - imgs (torch.Tensor): Tensor of batched multi-view images,
+                  has shape (B, N, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instances_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input sample. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
+              contains a tensor with shape (num_instances, 9).
""" - if return_loss: - return self.forward_train(**kwargs) - else: - return self.forward_test(**kwargs) - - def forward_train(self, - points=None, - img_metas=None, - gt_bboxes_3d=None, - gt_labels_3d=None, - gt_labels=None, - gt_bboxes=None, - img=None, - proposals=None, - gt_bboxes_ignore=None, - img_depth=None, - img_mask=None): - """Forward training function. + batch_input_metas = [item.metainfo for item in batch_data_samples] + batch_input_metas = self.add_lidar2img(batch_input_metas) + img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas) + outs = self.pts_bbox_head(img_feats, batch_input_metas) + + results_list_3d = self.pts_bbox_head.predict_by_feat( + outs, batch_input_metas, **kwargs) + detsamples = self.add_pred_to_datasample(batch_data_samples, + results_list_3d) + return detsamples + + # may need speed-up + def add_lidar2img(self, batch_input_metas: List[Dict]) -> List[Dict]: + """add 'lidar2img' transformation matrix into batch_input_metas. + Args: - points (list[torch.Tensor], optional): Points of each sample. - Defaults to None. - img_metas (list[dict], optional): Meta information of each sample. - Defaults to None. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): - Ground truth 3D boxes. Defaults to None. - gt_labels_3d (list[torch.Tensor], optional): Ground truth labels - of 3D boxes. Defaults to None. - gt_labels (list[torch.Tensor], optional): Ground truth labels - of 2D boxes in images. Defaults to None. - gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in - images. Defaults to None. - img (torch.Tensor optional): Images of each sample with shape - (N, C, H, W). Defaults to None. - proposals ([list[torch.Tensor], optional): Predicted proposals - used for training Fast RCNN. Defaults to None. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - 2D boxes in images to be ignored. Defaults to None. + batch_input_metas (list[dict]): Meta information of multiple inputs + in a batch. + Returns: - dict: Losses of different branches. 
+            batch_input_metas (list[dict]): Meta info with lidar2img added.
        """
-        img_feats = self.extract_feat(img=img, img_metas=img_metas)
-        losses = dict()
-        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
-                                            gt_labels_3d, img_metas,
-                                            gt_bboxes_ignore)
-        losses.update(losses_pts)
-        return losses
-
-    def forward_test(self, img_metas, img=None, **kwargs):
-        for var, name in [(img_metas, 'img_metas')]:
-            if not isinstance(var, list):
-                raise TypeError('{} must be a list, but got {}'.format(
-                    name, type(var)))
-        img = [img] if img is None else img
-        return self.simple_test(img_metas[0], img[0], **kwargs)
-        # if num_augs == 1:
-        #     img = [img] if img is None else img
-        #     return self.simple_test(None, img_metas[0], img[0], **kwargs)
-        # else:
-        #     return self.aug_test(None, img_metas, img, **kwargs)
-
-    def simple_test_pts(self, x, img_metas, rescale=False):
-        """Test function of point cloud branch."""
-        outs = self.pts_bbox_head(x, img_metas)
-        bbox_list = self.pts_bbox_head.get_bboxes(
-            outs, img_metas, rescale=rescale)
-        bbox_results = [
-            bbox3d2result(bboxes, scores, labels)
-            for bboxes, scores, labels in bbox_list
-        ]
-        return bbox_results
-
-    def simple_test(self, img_metas, img=None, rescale=False):
-        """Test function without augmentaiton."""
-        img_feats = self.extract_feat(img=img, img_metas=img_metas)
-
-        bbox_list = [dict() for i in range(len(img_metas))]
-        bbox_pts = self.simple_test_pts(
-            img_feats, img_metas, rescale=rescale)
-        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
-            result_dict['pts_bbox'] = pts_bbox
-        return bbox_list
-
-    def aug_test_pts(self, feats, img_metas, rescale=False):
-        feats_list = []
-        for j in range(len(feats[0])):
-            feats_list_level = []
-            for i in range(len(feats)):
-                feats_list_level.append(feats[i][j])
-            feats_list.append(torch.stack(feats_list_level, -1).mean(-1))
-        outs = self.pts_bbox_head(feats_list, img_metas)
-        bbox_list = self.pts_bbox_head.get_bboxes(
-            outs, img_metas, rescale=rescale)
-        bbox_results = [
-            bbox3d2result(bboxes, scores, labels)
-            for bboxes, scores, labels in bbox_list
-        ]
-        return bbox_results
-
-    def aug_test(self, img_metas, imgs=None, rescale=False):
-        """Test function with augmentaiton."""
-        img_feats = self.extract_feats(img_metas, imgs)
-        img_metas = img_metas[0]
-        bbox_list = [dict() for i in range(len(img_metas))]
-        bbox_pts = self.aug_test_pts(img_feats, img_metas, rescale)
-        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
-            result_dict['pts_bbox'] = pts_bbox
-        return bbox_list
+        for meta in batch_input_metas:
+            l2i = list()
+            for i in range(len(meta['cam2img'])):
+                c2i = torch.tensor(meta['cam2img'][i]).double()
+                l2c = torch.tensor(meta['lidar2cam'][i]).double()
+                l2i.append(get_lidar2img(c2i, l2c).float().numpy())
+            meta['lidar2img'] = l2i
+        return batch_input_metas
+
+
+# https://github.com/open-mmlab/mmdetection3d/pull/2110
+update_info_BUG_FIX = True
+
+
+def get_lidar2img(cam2img, lidar2cam):
+    """Get the projection matrix of lidar2img.
+
+    Args:
+        cam2img (torch.Tensor): A 3x3 or 4x4 projection matrix.
+        lidar2cam (torch.Tensor): A 3x3 or 4x4 projection matrix.
+
+    Returns:
+        torch.Tensor: Transformation matrix with shape 4x4.
+    """
+    if not update_info_BUG_FIX:
+        lidar2cam_r = lidar2cam[:3, :3]
+        lidar2cam_t = lidar2cam[:3, 3]
+        lidar2cam_t = torch.matmul(lidar2cam_t, lidar2cam_r.T)
+        lidar2cam[:3, 3] = lidar2cam_t
+    if cam2img.shape == (3, 3):
+        temp = cam2img.new_zeros(4, 4)
+        temp[:3, :3] = cam2img
+        temp[3, 3] = 1
+        cam2img = temp
+
+    if lidar2cam.shape == (3, 3):
+        temp = lidar2cam.new_zeros(4, 4)
+        temp[:3, :3] = lidar2cam
+        temp[3, 3] = 1
+        lidar2cam = temp
+    return torch.matmul(cam2img, lidar2cam)
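The composed matrix projects homogeneous LiDAR points into pixels, x_img ~ cam2img @ lidar2cam @ x_lidar. A worked toy example using the function above, with an identity extrinsic so the LiDAR and camera frames coincide:

    import torch

    cam2img = torch.tensor([[1000.0, 0.0, 640.0],
                            [0.0, 1000.0, 360.0],
                            [0.0, 0.0, 1.0]])
    lidar2cam = torch.eye(4)
    l2i = get_lidar2img(cam2img, lidar2cam)  # pads cam2img to 4x4 first
    pt = torch.tensor([0.0, 0.0, 2.0, 1.0])  # 2 m along the optical axis
    uvw = l2i @ pt
    print(uvw[:2] / uvw[2])  # tensor([640., 360.]) -> the principal point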
+ """ + if update_info_BUG_FIX == False: + lidar2cam_r = lidar2cam[:3, :3] + lidar2cam_t = lidar2cam[:3, 3] + lidar2cam_t = torch.matmul(lidar2cam_t, lidar2cam_r.T) + lidar2cam[:3, 3] = lidar2cam_t + if cam2img.shape == (3, 3): + temp = cam2img.new_zeros(4, 4) + temp[:3, :3] = cam2img + temp[3, 3] = 1 + cam2img = temp + + if lidar2cam.shape == (3, 3): + temp = lidar2cam.new_zeros(4, 4) + temp[:3, :3] = lidar2cam + temp[3, 3] = 1 + lidar2cam = temp + return torch.matmul(cam2img, lidar2cam) diff --git a/projects/mmdet3d_plugin/models/detectors/detr3d_old.py b/projects/mmdet3d_plugin/models/detectors/detr3d_old.py new file mode 100644 index 0000000..f5d30bb --- /dev/null +++ b/projects/mmdet3d_plugin/models/detectors/detr3d_old.py @@ -0,0 +1,40 @@ +from typing import Dict, List, Optional, Sequence + +import numpy as np +from mmdet3d.registry import MODELS +from mmdet3d.structures import Det3DDataSample +from torch import Tensor + +from .detr3d import Detr3D + + +@MODELS.register_module() +class Detr3D_old(Detr3D): + """Detr3D for old models trained earlier than mmdet3d-1.0.0.""" + + def __init__(self, **kawrgs): + super().__init__(**kawrgs) + + def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]], + batch_data_samples: List[Det3DDataSample], + **kwargs) -> List[Det3DDataSample]: + + batch_input_metas = [item.metainfo for item in batch_data_samples] + batch_input_metas = self.add_lidar2img(batch_input_metas) + img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas) + outs = self.pts_bbox_head(img_feats, batch_input_metas) + + results_list_3d = self.pts_bbox_head.predict_by_feat( + outs, batch_input_metas, **kwargs) + + # change the bboxes' format + for item in results_list_3d: + #cx, cy, cz, w, l, h, rot, vx, vy + item.bboxes_3d.tensor[..., [3, 4]] = item.bboxes_3d.tensor[..., + [4, 3]] + item.bboxes_3d.tensor[ + ..., 6] = -item.bboxes_3d.tensor[..., 6] - np.pi / 2 + + detsamples = self.add_pred_to_datasample(batch_data_samples, + results_list_3d) + return detsamples diff --git a/projects/mmdet3d_plugin/models/detectors/obj_dgcnn.py b/projects/mmdet3d_plugin/models/detectors/obj_dgcnn.py deleted file mode 100755 index 16fa0de..0000000 --- a/projects/mmdet3d_plugin/models/detectors/obj_dgcnn.py +++ /dev/null @@ -1,187 +0,0 @@ -import torch - -from mmdet.models import DETECTORS -from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d -from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector - - -@DETECTORS.register_module() -class ObjDGCNN(MVXTwoStageDetector): - """Base class of Multi-modality VoxelNet.""" - - def __init__(self, - pts_voxel_layer=None, - pts_voxel_encoder=None, - pts_middle_encoder=None, - pts_fusion_layer=None, - img_backbone=None, - pts_backbone=None, - img_neck=None, - pts_neck=None, - pts_bbox_head=None, - img_roi_head=None, - img_rpn_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None): - super(ObjDGCNN, - self).__init__(pts_voxel_layer, pts_voxel_encoder, - pts_middle_encoder, pts_fusion_layer, - img_backbone, pts_backbone, img_neck, pts_neck, - pts_bbox_head, img_roi_head, img_rpn_head, - train_cfg, test_cfg, pretrained) - - def extract_pts_feat(self, pts, img_feats, img_metas): - """Extract features of points.""" - if not self.with_pts_bbox: - return None - - voxels, num_points, coors = self.voxelize(pts) - - voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) - batch_size = coors[-1, 0] + 1 - x = self.pts_middle_encoder(voxel_features, coors, batch_size) - x = self.pts_backbone(x) - if 
self.with_pts_neck: - x = self.pts_neck(x) - return x - - def forward_pts_train(self, - pts_feats, - gt_bboxes_3d, - gt_labels_3d, - img_metas, - gt_bboxes_ignore=None): - """Forward function for point cloud branch. - Args: - pts_feats (list[torch.Tensor]): Features of point cloud branch - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. - gt_labels_3d (list[torch.Tensor]): Ground truth labels for - boxes of each sampole - img_metas (list[dict]): Meta information of samples. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - Returns: - dict: Losses of each branch. - """ - outs = self.pts_bbox_head(pts_feats) - loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] - losses = self.pts_bbox_head.loss(*loss_inputs) - return losses - - def simple_test_pts(self, x, img_metas, rescale=False): - """Test function of point cloud branch.""" - outs = self.pts_bbox_head(x) - bbox_list = self.pts_bbox_head.get_bboxes( - outs, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def aug_test_pts(self, feats, img_metas, rescale=False): - """Test function of point cloud branch with augmentaiton. - The function implementation process is as follows: - - step 1: map features back for double-flip augmentation. - - step 2: merge all features and generate boxes. - - step 3: map boxes back for scale augmentation. - - step 4: merge results. - Args: - feats (list[torch.Tensor]): Feature of point cloud. - img_metas (list[dict]): Meta information of samples. - rescale (bool): Whether to rescale bboxes. Default: False. - Returns: - dict: Returned bboxes consists of the following keys: - - boxes_3d (:obj:`LiDARInstance3DBoxes`): Predicted bboxes. - - scores_3d (torch.Tensor): Scores of predicted boxes. - - labels_3d (torch.Tensor): Labels of predicted boxes. - """ - # only support aug_test for one sample - outs_list = [] - for x, img_meta in zip(feats, img_metas): - outs = self.pts_bbox_head(x[0]) - # merge augmented outputs before decoding bboxes - for task_id, out in enumerate(outs): - for key in out[0].keys(): - if img_meta[0]['pcd_horizontal_flip']: - outs[task_id][0][key] = torch.flip( - outs[task_id][0][key], dims=[2]) - if key == 'reg': - outs[task_id][0][key][:, 1, ...] = 1 - outs[ - task_id][0][key][:, 1, ...] - elif key == 'rot': - outs[task_id][0][ - key][:, 1, - ...] = -outs[task_id][0][key][:, 1, ...] - elif key == 'vel': - outs[task_id][0][ - key][:, 1, - ...] = -outs[task_id][0][key][:, 1, ...] - if img_meta[0]['pcd_vertical_flip']: - outs[task_id][0][key] = torch.flip( - outs[task_id][0][key], dims=[3]) - if key == 'reg': - outs[task_id][0][key][:, 0, ...] = 1 - outs[ - task_id][0][key][:, 0, ...] - elif key == 'rot': - outs[task_id][0][ - key][:, 0, - ...] = -outs[task_id][0][key][:, 0, ...] - elif key == 'vel': - outs[task_id][0][ - key][:, 0, - ...] = -outs[task_id][0][key][:, 0, ...] 
- - outs_list.append(outs) - - preds_dicts = dict() - scale_img_metas = [] - - # concat outputs sharing the same pcd_scale_factor - for i, (img_meta, outs) in enumerate(zip(img_metas, outs_list)): - pcd_scale_factor = img_meta[0]['pcd_scale_factor'] - if pcd_scale_factor not in preds_dicts.keys(): - preds_dicts[pcd_scale_factor] = outs - scale_img_metas.append(img_meta) - else: - for task_id, out in enumerate(outs): - for key in out[0].keys(): - preds_dicts[pcd_scale_factor][task_id][0][key] += out[ - 0][key] - - aug_bboxes = [] - - for pcd_scale_factor, preds_dict in preds_dicts.items(): - for task_id, pred_dict in enumerate(preds_dict): - # merge outputs with different flips before decoding bboxes - for key in pred_dict[0].keys(): - preds_dict[task_id][0][key] /= len(outs_list) / len( - preds_dicts.keys()) - bbox_list = self.pts_bbox_head.get_bboxes( - preds_dict, img_metas[0], rescale=rescale) - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - if len(preds_dicts.keys()) > 1: - # merge outputs with different scales after decoding bboxes - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, scale_img_metas, - self.pts_bbox_head.test_cfg) - return merged_bboxes - else: - for key in bbox_list[0].keys(): - bbox_list[0][key] = bbox_list[0][key].to('cpu') - return bbox_list[0] - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test function with augmentaiton.""" - img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) - bbox_list = dict() - if pts_feats and self.with_pts_bbox: - pts_bbox = self.aug_test_pts(pts_feats, img_metas, rescale) - bbox_list.update(pts_bbox=pts_bbox) - return [bbox_list] \ No newline at end of file diff --git a/projects/mmdet3d_plugin/models/task_modules/__init__.py b/projects/mmdet3d_plugin/models/task_modules/__init__.py new file mode 100644 index 0000000..82e994e --- /dev/null +++ b/projects/mmdet3d_plugin/models/task_modules/__init__.py @@ -0,0 +1,3 @@ +from .hungarian_assigner_3d import HungarianAssigner3D +from .match_cost import BBox3DL1Cost +from .nms_free_coder import NMSFreeCoder diff --git a/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py b/projects/mmdet3d_plugin/models/task_modules/hungarian_assigner_3d.py similarity index 63% rename from projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py rename to projects/mmdet3d_plugin/models/task_modules/hungarian_assigner_3d.py index ccb4253..f5f8167 100755 --- a/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py +++ b/projects/mmdet3d_plugin/models/task_modules/hungarian_assigner_3d.py @@ -1,11 +1,14 @@ +from typing import List + import torch +from mmdet3d.registry import TASK_UTILS +from mmdet.models.task_modules.assigners import AssignResult # check +from mmdet.models.task_modules.assigners import BaseAssigner +from mmengine.structures import InstanceData +from torch import Tensor -from mmdet.core.bbox.builder import BBOX_ASSIGNERS -from mmdet.core.bbox.assigners import AssignResult -from mmdet.core.bbox.assigners import BaseAssigner -from mmdet.core.bbox.match_costs import build_match_cost -from mmdet.models.utils.transformer import inverse_sigmoid -from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +# from mmdet.core.bbox.match_costs import build_match_cost +from .util import normalize_bbox try: from scipy.optimize import linear_sum_assignment @@ -13,49 +16,46 @@ linear_sum_assignment = None 
-@BBOX_ASSIGNERS.register_module()
+@TASK_UTILS.register_module()
 class HungarianAssigner3D(BaseAssigner):
     """Computes one-to-one matching between predictions and ground truth.
+
     This class computes an assignment between the targets and the predictions
-    based on the costs. The costs are weighted sum of three components:
-    classification cost, regression L1 cost and regression iou cost. The
-    targets don't include the no_object, so generally there are more
-    predictions than targets. After the one-to-one matching, the un-matched
-    are treated as backgrounds. Thus each query prediction will be assigned
-    with `0` or a positive integer indicating the ground truth index:
+    based on the costs, which are a weighted sum of several components.
+    For DETR3D the cost is a weighted sum of the classification cost, the
+    regression L1 cost and the regression IoU cost. The targets don't
+    include the no_object, so generally there are more predictions than
+    targets. After the one-to-one matching, the un-matched are treated as
+    backgrounds. Thus each query prediction will be assigned with `0` or a
+    positive integer indicating the ground truth index:
+
     - 0: negative sample, no assigned gt
     - positive integer: positive sample, index (1-based) of assigned gt
+
     Args:
-        cls_weight (int | float, optional): The scale factor for classification
-            cost. Default 1.0.
-        bbox_weight (int | float, optional): The scale factor for regression
-            L1 cost. Default 1.0.
-        iou_weight (int | float, optional): The scale factor for regression
-            iou cost. Default 1.0.
-        iou_calculator (dict | optional): The config for the iou calculation.
-            Default type `BboxOverlaps2D`.
-        iou_mode (str | optional): "iou" (intersection over union), "iof"
-            (intersection over foreground), or "giou" (generalized
-            intersection over union). Default "giou".
+        cls_cost (obj:`ConfigDict`): Config of the classification match cost.
+        reg_cost (obj:`ConfigDict`): Config of the regression L1 match cost.
+        iou_cost (obj:`ConfigDict`): Config of the regression IoU match cost.
+        pc_range (List): Perception range of the detector.
     """

     def __init__(self,
                  cls_cost=dict(type='ClassificationCost', weight=1.),
                  reg_cost=dict(type='BBoxL1Cost', weight=1.0),
                  iou_cost=dict(type='IoUCost', weight=0.0),
-                 pc_range=None):
-        self.cls_cost = build_match_cost(cls_cost)
-        self.reg_cost = build_match_cost(reg_cost)
-        self.iou_cost = build_match_cost(iou_cost)
+                 pc_range: List = None):
+        self.cls_cost = TASK_UTILS.build(cls_cost)
+        self.reg_cost = TASK_UTILS.build(reg_cost)
+        self.iou_cost = TASK_UTILS.build(iou_cost)
         self.pc_range = pc_range

     def assign(self,
-               bbox_pred,
-               cls_pred,
-               gt_bboxes,
-               gt_labels,
+               bbox_pred: Tensor,
+               cls_pred: Tensor,
+               gt_bboxes: Tensor,
+               gt_labels: Tensor,
                gt_bboxes_ignore=None,
-               eps=1e-7):
+               eps=1e-7) -> AssignResult:
         """Computes one-to-one matching based on the weighted costs.
         This method assign each query prediction to a ground truth or
         background. The `assigned_gt_inds` with -1 means don't care,
@@ -70,23 +70,22 @@ def assign(self,
         and assign the corresponding gt index (plus 1) to it.
         Args:
             bbox_pred (Tensor): Predicted boxes with normalized coordinates
-                (cx, cy, w, h), which are all in range [0, 1]. Shape
-                [num_query, 4].
+                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y), as produced by
+                `normalize_bbox`. Shape [num_query, 10].
             cls_pred (Tensor): Predicted classification logits, shape
                 [num_query, num_class].
             gt_bboxes (Tensor): Ground truth boxes with unnormalized
-                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+                coordinates (cx,cy,cz,l,w,h,φ,v_x,v_y). Shape [num_gt, 9].
             gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
                labelled as `ignored`. Default None.
-            eps (int | float, optional): A value added to the denominator for
-                numerical stability. Default 1e-7.
+            eps (int | float, optional): Unused parameter.
        Returns:
            :obj:`AssignResult`: The assigned result.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
-        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)  # e.g. 9, 900

        # 1. assign -1 by default
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
@@ -100,19 +99,24 @@ def assign(self,
        if num_gts == 0:
            # No ground truth, assign all to background
            assigned_gt_inds[:] = 0
-            return AssignResult(
-                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+            return AssignResult(num_gts,
+                                assigned_gt_inds,
+                                None,
+                                labels=assigned_labels)

        # 2. compute the weighted costs
        # classification and bboxcost.
-        cls_cost = self.cls_cost(cls_pred, gt_labels)
+        # wrap inputs in InstanceData for the dev-1.x match-cost interface
+        pred_instances = InstanceData(scores=cls_pred)
+        gt_instances = InstanceData(labels=gt_labels)
+        cls_cost = self.cls_cost(pred_instances, gt_instances)
        # regression L1 cost
        normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
        reg_cost = self.reg_cost(bbox_pred[:, :8],
                                 normalized_gt_bboxes[:, :8])
-
+        # weighted sum of above two costs
        cost = cls_cost + reg_cost
-
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
@@ -130,5 +134,7 @@ def assign(self,
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
-        return AssignResult(
-            num_gts, assigned_gt_inds, None, labels=assigned_labels)
\ No newline at end of file
+        return AssignResult(num_gts,
+                            assigned_gt_inds,
+                            None,
+                            labels=assigned_labels)
diff --git a/projects/mmdet3d_plugin/models/task_modules/match_cost.py b/projects/mmdet3d_plugin/models/task_modules/match_cost.py
new file mode 100755
index 0000000..89152f1
--- /dev/null
+++ b/projects/mmdet3d_plugin/models/task_modules/match_cost.py
@@ -0,0 +1,32 @@
+from typing import Union
+
+import torch
+from mmdet3d.registry import TASK_UTILS
+from torch import Tensor
+
+
+@TASK_UTILS.register_module()
+class BBox3DL1Cost(object):
+    """BBox3DL1Cost.
+
+    Args:
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self, weight: Union[float, int] = 1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred: Tensor, gt_bboxes: Tensor) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y), as from `normalize_bbox`.
+                Shape [num_query, 10].
+            gt_bboxes (Tensor): Ground truth boxes with `normalized`
+                coordinates (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y). Shape [num_gt, 10].
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
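+
+        Example (illustrative only, not from the original DETR3D code):
+            >>> import torch
+            >>> self = BBox3DL1Cost(weight=0.25)
+            >>> bbox_pred = torch.rand(900, 10)
+            >>> gt_bboxes = torch.rand(9, 10)
+            >>> self(bbox_pred, gt_bboxes).shape
+            torch.Size([900, 9])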
+ """ + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight diff --git a/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py b/projects/mmdet3d_plugin/models/task_modules/nms_free_coder.py similarity index 73% rename from projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py rename to projects/mmdet3d_plugin/models/task_modules/nms_free_coder.py index 11eafc3..8cbe3ac 100755 --- a/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py +++ b/projects/mmdet3d_plugin/models/task_modules/nms_free_coder.py @@ -1,13 +1,14 @@ import torch +from mmdet3d.registry import TASK_UTILS +from mmdet.models.task_modules import BaseBBoxCoder -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS -from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox +from .util import denormalize_bbox -@BBOX_CODERS.register_module() +@TASK_UTILS.register_module() class NMSFreeCoder(BaseBBoxCoder): """Bbox coder for NMS-free detector. + Args: pc_range (list[float]): Range of point cloud. post_center_range (list[float]): Limit of the center. @@ -15,17 +16,16 @@ class NMSFreeCoder(BaseBBoxCoder): max_num (int): Max number to be kept. Default: 100. score_threshold (float): Threshold to filter boxes based on score. Default: None. - code_size (int): Code size of bboxes. Default: 9 """ def __init__(self, - pc_range, + pc_range=None, voxel_size=None, post_center_range=None, max_num=100, score_threshold=None, num_classes=10): - + self.pc_range = pc_range self.voxel_size = voxel_size self.post_center_range = post_center_range @@ -38,34 +38,36 @@ def encode(self): def decode_single(self, cls_scores, bbox_preds): """Decode bboxes. + Args: cls_scores (Tensor): Outputs from the classification head, \ shape [num_query, cls_out_channels]. Note \ cls_out_channels should includes background. bbox_preds (Tensor): Outputs from the regression \ - head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ - Shape [num_query, 9]. + head with normalized coordinate format (cx, cy, l, w, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 10]. Returns: list[dict]: Decoded boxes. """ max_num = self.max_num cls_scores = cls_scores.sigmoid() - scores, indexs = cls_scores.view(-1).topk(max_num) - labels = indexs % self.num_classes - bbox_index = indexs // self.num_classes + scores, indexes = cls_scores.view(-1).topk(max_num) + labels = indexes % self.num_classes + bbox_index = indexes // self.num_classes bbox_preds = bbox_preds[bbox_index] - final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) - final_scores = scores - final_preds = labels + # [[cx, cy, cz, l, w, h, rot, vx, vy]] + final_box_preds = denormalize_bbox(bbox_preds, None) + final_scores = scores + final_preds = labels # use score threshold if self.score_threshold is not None: thresh_mask = final_scores > self.score_threshold if self.post_center_range is not None: - self.post_center_range = torch.tensor( - self.post_center_range, device=scores.device) + self.post_center_range = torch.tensor(self.post_center_range, + device=scores.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(1) mask &= (final_box_preds[..., :3] <= @@ -91,21 +93,24 @@ def decode_single(self, cls_scores, bbox_preds): def decode(self, preds_dicts): """Decode bboxes. + Args: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. 
            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
-                head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
-                Shape [nb_dec, bs, num_query, 9].
+                head with normalized coordinate format (cx, cy, l, w, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [nb_dec, bs, num_query, 10].
        Returns:
            list[dict]: Decoded boxes.
        """
-        all_cls_scores = preds_dicts['all_cls_scores'][-1]
+        all_cls_scores = preds_dicts['all_cls_scores'][
+            -1]  # predictions of the last decoder layer
        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
-
+
        batch_size = all_cls_scores.size()[0]
        predictions_list = []
        for i in range(batch_size):
-            predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
-        return predictions_list
\ No newline at end of file
+            predictions_list.append(
+                self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+        return predictions_list
diff --git a/projects/mmdet3d_plugin/models/task_modules/util.py b/projects/mmdet3d_plugin/models/task_modules/util.py
new file mode 100755
index 0000000..ea1e51b
--- /dev/null
+++ b/projects/mmdet3d_plugin/models/task_modules/util.py
@@ -0,0 +1,76 @@
+from typing import List
+
+import torch
+from torch import Tensor
+
+
+def normalize_bbox(bboxes: Tensor, pc_range: List) -> Tensor:
+    """Normalize bboxes.
+    Args:
+        bboxes (Tensor): boxes with unnormalized
+            coordinates (cx,cy,cz,l,w,h,φ,v_x,v_y). Shape [num_gt, 9].
+        pc_range (List): Perception range of the detector (unused here).
+    Returns:
+        normalized_bboxes (Tensor): boxes with coordinates \
+            (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y); centers stay in metric \
+            units, sizes are log-scaled. Shape [num_gt, 10].
+    """
+
+    cx = bboxes[..., 0:1]
+    cy = bboxes[..., 1:2]
+    cz = bboxes[..., 2:3]
+    w = bboxes[..., 3:4].log()
+    l = bboxes[..., 4:5].log()
+    h = bboxes[..., 5:6].log()
+
+    rot = bboxes[..., 6:7]
+    if bboxes.size(-1) > 7:
+        vx = bboxes[..., 7:8]
+        vy = bboxes[..., 8:9]
+        normalized_bboxes = torch.cat(
+            (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1)
+    else:
+        normalized_bboxes = torch.cat(
+            (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1)
+    return normalized_bboxes
+
+
+def denormalize_bbox(normalized_bboxes, pc_range):
+    """Denormalize bboxes.
+    Args:
+        normalized_bboxes (Tensor): boxes with coordinates \
+            (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y), as produced by \
+            `normalize_bbox`. Shape [num_query, 10].
+        pc_range (List): Perception range of the detector (unused here).
+    Returns:
+        denormalized_bboxes (Tensor): boxes with unnormalized
+            coordinates (cx,cy,cz,l,w,h,φ,v_x,v_y). Shape [num_gt, 9].
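+
+    Example (illustrative sketch; `pc_range` is unused by both helpers,
+    so ``None`` can be passed):
+        >>> import torch
+        >>> box = torch.tensor([[1., 2., 0., 4.2, 1.9, 1.6, .3, .5, .1]])
+        >>> restored = denormalize_bbox(normalize_bbox(box, None), None)
+        >>> torch.allclose(restored, box)
+        True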
+ """ + # rotation + rot_sine = normalized_bboxes[..., 6:7] + + rot_cosine = normalized_bboxes[..., 7:8] + rot = torch.atan2(rot_sine, rot_cosine) + + # center in the bev + cx = normalized_bboxes[..., 0:1] + cy = normalized_bboxes[..., 1:2] + cz = normalized_bboxes[..., 4:5] + + # size, the meaning of w,l may alter in different version of mmdet3d + w = normalized_bboxes[..., 2:3] + l = normalized_bboxes[..., 3:4] + h = normalized_bboxes[..., 5:6] + + w = w.exp() + l = l.exp() + h = h.exp() + if normalized_bboxes.size(-1) > 8: + # velocity + vx = normalized_bboxes[:, 8:9] + vy = normalized_bboxes[:, 9:10] + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], + dim=-1) + else: + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) + return denormalized_bboxes diff --git a/projects/mmdet3d_plugin/models/utils/__init__.py b/projects/mmdet3d_plugin/models/utils/__init__.py index 797ac7f..54bdea5 100755 --- a/projects/mmdet3d_plugin/models/utils/__init__.py +++ b/projects/mmdet3d_plugin/models/utils/__init__.py @@ -1,6 +1,2 @@ -from .dgcnn_attn import DGCNNAttn -from .detr import Deformable3DDetrTransformerDecoder -from .detr3d_transformer import Detr3DTransformer, Detr3DTransformerDecoder, Detr3DCrossAtten - -__all__ = ['DGCNNAttn', 'Deformable3DDetrTransformerDecoder', - 'Detr3DTransformer', 'Detr3DTransformerDecoder', 'Detr3DCrossAtten'] +from .detr3d_transformer import (Detr3DCrossAtten, Detr3DTransformer, + Detr3DTransformerDecoder) diff --git a/projects/mmdet3d_plugin/models/utils/detr.py b/projects/mmdet3d_plugin/models/utils/detr.py deleted file mode 100755 index ff72d32..0000000 --- a/projects/mmdet3d_plugin/models/utils/detr.py +++ /dev/null @@ -1,106 +0,0 @@ -import torch -from mmcv.cnn.bricks.registry import TRANSFORMER_LAYER_SEQUENCE -from mmcv.cnn.bricks.transformer import TransformerLayerSequence - - -def inverse_sigmoid(x, eps=1e-5): - """Inverse function of sigmoid. - Args: - x (Tensor): The tensor to do the - inverse. - eps (float): EPS avoid numerical - overflow. Defaults 1e-5. - Returns: - Tensor: The x has passed the inverse - function of sigmoid, has same - shape with input. - """ - x = x.clamp(min=0, max=1) - x1 = x.clamp(min=eps) - x2 = (1 - x).clamp(min=eps) - return torch.log(x1 / x2) - - -@TRANSFORMER_LAYER_SEQUENCE.register_module() -class Deformable3DDetrTransformerDecoder(TransformerLayerSequence): - """Copy the decoder in DETR transformer. - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - - def __init__(self, *args, return_intermediate=False, **kwargs): - super(Deformable3DDetrTransformerDecoder, self).__init__(*args, **kwargs) - self.return_intermediate = return_intermediate - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - reg_branches=None, - **kwargs): - """Forward function for `TransformerDecoder`. - Args: - query (Tensor): Input query with shape - `(num_query, bs, embed_dims)`. - reference_points (Tensor): The reference - points of offset. has shape - (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). - valid_ratios (Tensor): The radios of valid - points on the feature map, has shape - (bs, num_levels, 2) - reg_branch: (obj:`nn.ModuleList`): Used for - refining the regression results. Only would - be passed when with_box_refine is True, - otherwise would be passed a `None`. 
- Returns: - Tensor: Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims]. - """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: - reference_points_input = reference_points[:, :, None] * \ - torch.cat([valid_ratios, valid_ratios], -1)[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * \ - valid_ratios[:, None] - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) - output = output.permute(1, 0, 2) - - if reg_branches is not None: - tmp = reg_branches[lid](output) - if reference_points.shape[-1] == 4: - new_reference_points = tmp + inverse_sigmoid( - reference_points) - new_reference_points = new_reference_points.sigmoid() - else: - assert reference_points.shape[-1] == 2 - # This is to deal with the different output number (10). - # new_reference_points = tmp - new_reference_points = tmp[ - ..., :2] + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - reference_points = new_reference_points.detach() - - output = output.permute(1, 0, 2) - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append(reference_points) - - if self.return_intermediate: - return torch.stack(intermediate), torch.stack( - intermediate_reference_points) - - return output, reference_points diff --git a/projects/mmdet3d_plugin/models/utils/detr3d_transformer.py b/projects/mmdet3d_plugin/models/utils/detr3d_transformer.py index 2a94e5c..b3cf281 100755 --- a/projects/mmdet3d_plugin/models/utils/detr3d_transformer.py +++ b/projects/mmdet3d_plugin/models/utils/detr3d_transformer.py @@ -1,21 +1,20 @@ +import time +import warnings import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn import xavier_init, constant_init -from mmcv.cnn.bricks.registry import (ATTENTION, - TRANSFORMER_LAYER_SEQUENCE) -from mmcv.cnn.bricks.transformer import (MultiScaleDeformableAttention, - TransformerLayerSequence, +from mmcv.cnn.bricks.transformer import (TransformerLayerSequence, build_transformer_layer_sequence) -from mmcv.runner.base_module import BaseModule - -from mmdet.models.utils.builder import TRANSFORMER +from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention +from mmdet3d.registry import MODELS +from mmengine.model import BaseModule, constant_init, xavier_init def inverse_sigmoid(x, eps=1e-5): """Inverse function of sigmoid. + Args: x (Tensor): The tensor to do the inverse. @@ -32,14 +31,17 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -@TRANSFORMER.register_module() +@MODELS.register_module() class Detr3DTransformer(BaseModule): """Implements the Detr3D transformer. + Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. + num_cams (int): Number of cameras in the dataset. + Default: 6 in NuScenes Det. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. 
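+
+    Note:
+        ``forward`` expects `query_embed` to be the concatenation of a
+        positional and a semantic embedding along the channel dimension
+        (shape [num_query, 2 * embed_dims]); it is split back into the
+        two halves via ``torch.split`` inside ``forward``.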
""" @@ -68,29 +70,24 @@ def init_weights(self): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): - if isinstance(m, MultiScaleDeformableAttention) or isinstance(m, Detr3DCrossAtten): + if isinstance(m, MultiScaleDeformableAttention) or isinstance( + m, Detr3DCrossAtten): m.init_weight() xavier_init(self.reference_points, distribution='uniform', bias=0.) - def forward(self, - mlvl_feats, - query_embed, - reg_branches=None, - **kwargs): + def forward(self, mlvl_feats, query_embed, reg_branches=None, **kwargs): """Forward function for `Detr3DTransformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape - [bs, embed_dims, h, w]. - query_embed (Tensor): The query embedding for decoder, - with shape [num_query, c]. + (B, N, C, H_lvl, W_lvl). + query_embed (Tensor): The query positional and semantic embedding for decoder, with shape [num_query, c+c]. mlvl_pos_embeds (list(Tensor)): The positional encoding - of feats from different level, has the shape - [bs, embed_dims, h, w]. + of feats from different level, has the shape [bs, N, embed_dims, h, w]. + It is unused here. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would - be passed when - `with_box_refine` is True. Default to None. + be passed when `with_box_refine` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If @@ -101,24 +98,13 @@ def forward(self, points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ - (num_dec_layers, bs,num_query, embed_dims) - - enc_outputs_class: The classification score of \ - proposals generated from \ - encoder's feature maps, has shape \ - (batch, h*w, num_classes). \ - Only would be returned when `as_two_stage` is True, \ - otherwise None. - - enc_outputs_coord_unact: The regression results \ - generated from encoder's feature maps., has shape \ - (batch, h*w, 4). Only would \ - be returned when `as_two_stage` is True, \ - otherwise None. + (num_dec_layers, bs, num_query, embed_dims) """ assert query_embed is not None bs = mlvl_feats[0].size(0) - query_pos, query = torch.split(query_embed, self.embed_dims , dim=1) - query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) - query = query.unsqueeze(0).expand(bs, -1, -1) + query_pos, query = torch.split(query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) #[bs,num_q,c] + query = query.unsqueeze(0).expand(bs, -1, -1) #[bs,num_q,c] reference_points = self.reference_points(query_pos) reference_points = reference_points.sigmoid() init_reference_out = reference_points @@ -139,9 +125,10 @@ def forward(self, return inter_states, init_reference_out, inter_references_out -@TRANSFORMER_LAYER_SEQUENCE.register_module() +@MODELS.register_module() class Detr3DTransformerDecoder(TransformerLayerSequence): """Implements the decoder in DETR3D transformer. + Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: @@ -165,7 +152,7 @@ def forward(self, reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). + otherwise has shape self.reference_points = nn.Linear(self.embed_dims, 3) reg_branch: (obj:`nn.ModuleList`): Used for refining the regression results. 
Only would be passed when with_box_refine is True, @@ -178,26 +165,24 @@ def forward(self, output = query intermediate = [] intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): + for lid, layer in enumerate(self.layers): ## iterative refinement reference_points_input = reference_points - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) + output = layer(output, + *args, + reference_points=reference_points_input, + **kwargs) output = output.permute(1, 0, 2) - if reg_branches is not None: tmp = reg_branches[lid](output) - + assert reference_points.shape[-1] == 3 new_reference_points = torch.zeros_like(reference_points) - new_reference_points[..., :2] = tmp[ - ..., :2] + inverse_sigmoid(reference_points[..., :2]) - new_reference_points[..., 2:3] = tmp[ - ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) - + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid( + reference_points[..., :2]) + new_reference_points[..., + 2:3] = tmp[..., 4:5] + inverse_sigmoid( + reference_points[..., 2:3]) new_reference_points = new_reference_points.sigmoid() reference_points = new_reference_points.detach() @@ -214,9 +199,10 @@ def forward(self, return output, reference_points -@ATTENTION.register_module() +@MODELS.register_module() class Detr3DCrossAtten(BaseModule): - """An attention module used in Detr3d. + """An attention module used in Detr3d. + Args: embed_dims (int): The embedding dimension of Attention. Default: 256. @@ -233,18 +219,21 @@ class Detr3DCrossAtten(BaseModule): Default: None. """ - def __init__(self, - embed_dims=256, - num_heads=8, - num_levels=4, - num_points=5, - num_cams=6, - im2col_step=64, - pc_range=None, - dropout=0.1, - norm_cfg=None, - init_cfg=None, - batch_first=False): + def __init__( + self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=5, + num_cams=6, + im2col_step=64, + pc_range=None, + dropout=0.1, + norm_cfg=None, + init_cfg=None, + batch_first=False, + # waymo_with_nuscene=False + ): super(Detr3DCrossAtten, self).__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' @@ -278,20 +267,20 @@ def _is_power_of_2(n): self.num_points = num_points self.num_cams = num_cams self.attention_weights = nn.Linear(embed_dims, - num_cams*num_levels*num_points) + num_cams * num_levels * num_points) self.output_proj = nn.Linear(embed_dims, embed_dims) - + self.position_encoder = nn.Sequential( - nn.Linear(3, self.embed_dims), + nn.Linear(3, self.embed_dims), nn.LayerNorm(self.embed_dims), nn.ReLU(inplace=True), - nn.Linear(self.embed_dims, self.embed_dims), + nn.Linear(self.embed_dims, self.embed_dims), nn.LayerNorm(self.embed_dims), nn.ReLU(inplace=True), ) self.batch_first = batch_first - + # self.waymo_with_nuscene = waymo_with_nuscene self.init_weight() def init_weight(self): @@ -305,44 +294,27 @@ def forward(self, value, residual=None, query_pos=None, - key_padding_mask=None, reference_points=None, - spatial_shapes=None, - level_start_index=None, **kwargs): """Forward Function of Detr3DCrossAtten. + Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. - value (Tensor): The value tensor with shape - `(num_key, bs, embed_dims)`. (B, N, C, H, W) + value (List[Tensor]): Image features from + different level. Each element has shape + (B, N, C, H_lvl, W_lvl). residual (Tensor): The tensor used for addition, with the same shape as `x`. 
Default None. If None, `x` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. - key_pos (Tensor): The positional encoding for `key`. Default - None. - reference_points (Tensor): The normalized reference - points with shape (bs, num_query, 4), - all elements is range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area. - or (N, Length_{query}, num_levels, 4), add - additional two dimensions is (w, h) to - form reference boxes. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. - spatial_shapes (Tensor): Spatial shape of features in - different level. With shape (num_levels, 2), - last dimension represent (h, w). - level_start_index (Tensor): The start index of each level. - A tensor has shape (num_levels) and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + reference_points (Tensor): The normalized 3D reference + points with shape (bs, num_query, 3) Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ - if key is None: key = query if value is None: @@ -353,70 +325,119 @@ def forward(self, if query_pos is not None: query = query + query_pos - # change to (bs, num_query, embed_dims) query = query.permute(1, 0, 2) bs, num_query, _ = query.size() attention_weights = self.attention_weights(query).view( bs, 1, num_query, self.num_cams, self.num_points, self.num_levels) - reference_points_3d, output, mask = feature_sampling( value, reference_points, self.pc_range, kwargs['img_metas']) output = torch.nan_to_num(output) mask = torch.nan_to_num(mask) - + # if self.waymo_with_nuscene == True: + # num_view = mask.shape[3] + # attention_weights = attention_weights[:, :, :, :num_view, ...] attention_weights = attention_weights.sigmoid() * mask output = output * attention_weights output = output.sum(-1).sum(-1).sum(-1) output = output.permute(2, 0, 1) - - output = self.output_proj(output) # (num_query, bs, embed_dims) - pos_feat = self.position_encoder(inverse_sigmoid(reference_points_3d)).permute(1, 0, 2) - + output = self.output_proj(output) + pos_feat = self.position_encoder( + inverse_sigmoid(reference_points_3d)).permute(1, 0, 2) return self.dropout(output) + inp_residual + pos_feat -def feature_sampling(mlvl_feats, reference_points, pc_range, img_metas): - lidar2img = [] - for img_meta in img_metas: - lidar2img.append(img_meta['lidar2img']) +def feature_sampling( + mlvl_feats, + ref_pt, + pc_range, + img_metas, + # return_depth=False +): + """ sample multi-level features by projecting 3D reference points to 2D image + Args: + mlvl_feats (List[Tensor]): Image features from + different level. Each element has shape + (B, N, C, H_lvl, W_lvl). + ref_pt (Tensor): The normalized 3D reference + points with shape (bs, num_query, 3) + pc_range: perception range of the detector + img_metas (list[dict]): Meta information of multiple inputs + in a batch, containing `lidar2img`. 
+    Returns:
+        ref_pt_3d (Tensor): A copy of the original ref_pt
+        sampled_feats (Tensor): sampled features with shape \
+            (B C num_q N 1 fpn_lvl)
+        mask (Tensor): Indicates whether each reference point \
+            is projected outside of the images, with shape \
+            (B 1 num_q N 1 1)
+    """
+    lidar2img = [meta['lidar2img'] for meta in img_metas]
     lidar2img = np.asarray(lidar2img)
-    lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4)
-    reference_points = reference_points.clone()
-    reference_points_3d = reference_points.clone()
-    reference_points[..., 0:1] = reference_points[..., 0:1]*(pc_range[3] - pc_range[0]) + pc_range[0]
-    reference_points[..., 1:2] = reference_points[..., 1:2]*(pc_range[4] - pc_range[1]) + pc_range[1]
-    reference_points[..., 2:3] = reference_points[..., 2:3]*(pc_range[5] - pc_range[2]) + pc_range[2]
-    # reference_points (B, num_queries, 4)
-    reference_points = torch.cat((reference_points, torch.ones_like(reference_points[..., :1])), -1)
-    B, num_query = reference_points.size()[:2]
+    lidar2img = ref_pt.new_tensor(lidar2img)
+    ref_pt = ref_pt.clone()
+    ref_pt_3d = ref_pt.clone()
+
+    B, num_query = ref_pt.size()[:2]
     num_cam = lidar2img.size(1)
-    reference_points = reference_points.view(B, 1, num_query, 4).repeat(1, num_cam, 1, 1).unsqueeze(-1)
-    lidar2img = lidar2img.view(B, num_cam, 1, 4, 4).repeat(1, 1, num_query, 1, 1)
-    reference_points_cam = torch.matmul(lidar2img, reference_points).squeeze(-1)
     eps = 1e-5
-    mask = (reference_points_cam[..., 2:3] > eps)
-    reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum(
-        reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3])*eps)
-    reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1]
-    reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0]
-    reference_points_cam = (reference_points_cam - 0.5) * 2
-    mask = (mask & (reference_points_cam[..., 0:1] > -1.0)
-            & (reference_points_cam[..., 0:1] < 1.0)
-            & (reference_points_cam[..., 1:2] > -1.0)
-            & (reference_points_cam[..., 1:2] < 1.0))
+
+    ref_pt[..., 0:1] = \
+        ref_pt[..., 0:1] * (pc_range[3] - pc_range[0]) + pc_range[0]  # x
+    ref_pt[..., 1:2] = \
+        ref_pt[..., 1:2] * (pc_range[4] - pc_range[1]) + pc_range[1]  # y
+    ref_pt[..., 2:3] = \
+        ref_pt[..., 2:3] * (pc_range[5] - pc_range[2]) + pc_range[2]  # z
+
+    # (B num_q 3) -> (B num_q 4) -> (B 1 num_q 4) -> (B num_cam num_q 4 1)
+    ref_pt = torch.cat((ref_pt, torch.ones_like(ref_pt[..., :1])), -1)
+    ref_pt = ref_pt.view(B, 1, num_query, 4)
+    ref_pt = ref_pt.repeat(1, num_cam, 1, 1).unsqueeze(-1)
+    # (B num_cam 4 4) -> (B num_cam num_q 4 4)
+    lidar2img = lidar2img.view(B, num_cam, 1, 4, 4)\
+        .repeat(1, 1, num_query, 1, 1)
+    # (... 4 4) * (...
4 1) -> (B num_cam num_q 4) + pt_cam = torch.matmul(lidar2img, ref_pt).squeeze(-1) + + # (B num_cam num_q) + z = pt_cam[..., 2:3] + eps = eps * torch.ones_like(z) + mask = (z > eps) + pt_cam = pt_cam[..., 0:2] / torch.maximum(z, eps) # prevent zero-division + #padded nuscene image: 928*1600 + (h, w) = img_metas[0]['pad_shape'] + pt_cam[..., 0] /= w + pt_cam[..., 1] /= h + # else: + # (h,w,_) = img_metas[0]['ori_shape'][0] #waymo image + # pt_cam[..., 0] /= w #cam0~2: 1280*1920 + # pt_cam[..., 1] /= h #cam3~4: 886 *1920 padded to 1280*1920 + # mask[:, 3:5, :] &= (pt_cam[:, 3:5, :, 1:2] < 0.7) #filter pt_cam_y > 886 + + mask = (mask & (pt_cam[..., 0:1] > 0.0) + & (pt_cam[..., 0:1] < 1.0) + & (pt_cam[..., 1:2] > 0.0) + & (pt_cam[..., 1:2] < 1.0)) + #(B num_cam num_q) -> (B 1 num_q num_cam 1 1) mask = mask.view(B, num_cam, 1, num_query, 1, 1).permute(0, 2, 3, 1, 4, 5) mask = torch.nan_to_num(mask) + + pt_cam = (pt_cam - 0.5) * 2 #[0,1] to [-1,1] to do grid_sample sampled_feats = [] for lvl, feat in enumerate(mlvl_feats): B, N, C, H, W = feat.size() - feat = feat.view(B*N, C, H, W) - reference_points_cam_lvl = reference_points_cam.view(B*N, num_query, 1, 2) - sampled_feat = F.grid_sample(feat, reference_points_cam_lvl) - sampled_feat = sampled_feat.view(B, N, C, num_query, 1).permute(0, 2, 3, 1, 4) + feat = feat.view(B * N, C, H, W) + pt_cam_lvl = pt_cam.view(B * N, num_query, 1, 2) + sampled_feat = F.grid_sample(feat, pt_cam_lvl) + #(B num_cam C num_query 1) -> List of (B C num_q num_cam 1) + sampled_feat = sampled_feat.view(B, N, C, num_query, 1) + sampled_feat = sampled_feat.permute(0, 2, 3, 1, 4) sampled_feats.append(sampled_feat) + sampled_feats = torch.stack(sampled_feats, -1) - sampled_feats = sampled_feats.view(B, C, num_query, num_cam, 1, len(mlvl_feats)) - return reference_points_3d, sampled_feats, mask + # (B C num_q num_cam fpn_lvl) + sampled_feats = \ + sampled_feats.view(B, C, num_query, num_cam, 1, len(mlvl_feats)) + return ref_pt_3d, sampled_feats, mask diff --git a/projects/mmdet3d_plugin/models/utils/dgcnn_attn.py b/projects/mmdet3d_plugin/models/utils/dgcnn_attn.py deleted file mode 100755 index 5cfd459..0000000 --- a/projects/mmdet3d_plugin/models/utils/dgcnn_attn.py +++ /dev/null @@ -1,96 +0,0 @@ -import math - -import torch -import torch.nn as nn -from mmcv.cnn.bricks.registry import ATTENTION -from mmcv.runner.base_module import BaseModule - - -@ATTENTION.register_module() -class DGCNNAttn(BaseModule): - """A warpper for DGCNN-type self-attention. - Args: - embed_dims (int): The embedding dimension. - num_heads (int): Parallel attention heads. Same as - `nn.MultiheadAttention`. - dropout (float):w A Dropout layer on attn_output_weights. Default: 0.. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims, - num_heads, - dropout=0., - init_cfg=None, - **kwargs): - super(DGCNNAttn, self).__init__(init_cfg) - self.embed_dims = embed_dims - self.num_heads = num_heads - self.dropout = dropout - self.conv1 = nn.Sequential(nn.Conv2d(self.embed_dims*2, self.embed_dims, kernel_size=1, bias=False), - nn.BatchNorm2d(self.embed_dims), - nn.ReLU(inplace=True)) - self.conv2 = nn.Sequential(nn.Conv2d(self.embed_dims*2, self.embed_dims, kernel_size=1, bias=False), - nn.BatchNorm2d(self.embed_dims), - nn.ReLU(inplace=True)) - self.K = kwargs['K'] - self.dropout = nn.Dropout(dropout) - - def forward(self, - query, - key=None, - value=None, - residual=None, - query_pos=None, - key_pos=None, - attn_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `DGCNN`. - **kwargs allow passing a more general data flow when combining - with other operations in `DGCNN`. - Args: - query (Tensor): The input query with shape [num_queries, bs, - embed_dims]. Same in `nn.MultiheadAttention.forward`. - residual (Tensor): This tensor, with the same shape as x, - will be used for the residual link. - If None, `x` will be used. Defaults to None. - query_pos (Tensor): The positional encoding for query, with - the same shape as `x`. If not None, it will - be added to `x` before forward function. Defaults to None. - Returns: - Tensor: forwarded results with shape [num_queries, bs, embed_dims]. - """ - if residual is None: - residual = query - if query_pos is not None: - query = query + query_pos - - query = query.permute(1, 0, 2) # [bs, num_queries, embed_dims] - edge_feats = self.edge_feats(query, K=self.K) - edge_feats1 = self.conv1(edge_feats) - edge_feats1 = edge_feats1.max(dim=-1)[0] - out = edge_feats1 - edge_feats1 = self.edge_feats(edge_feats1.permute(0, 2, 1)) - edge_feats2 = self.conv2(edge_feats1) - edge_feats2 = edge_feats2.max(dim=-1)[0] - out = out + edge_feats2 - out = out.permute(2, 0, 1) - return residual + self.dropout(out) - - def edge_feats(self, query, K=16): - # (B, N, N) - affinity = torch.cdist(query, query) - # (B, N, K) - _, topk = torch.topk(affinity, k=K, dim=2) - B, N, C = query.size() - - idx_base = torch.arange(0, B, device=query.device).view(-1, 1, 1) * N - idx = topk + idx_base - idx = idx.view(-1) - query = query.reshape(B*N, C) - query_neighbor = query[idx, :].view(B, N, K, C) - query = query.reshape(B, N, 1, C).repeat(1, 1, K, 1) - out = torch.cat((query_neighbor, query), dim=-1).permute(0, 3, 1, 2).contiguous() - return out diff --git a/projects/mmdet3d_plugin/models/utils/grid_mask.py b/projects/mmdet3d_plugin/models/utils/grid_mask.py index fe05ca8..f57bb50 100755 --- a/projects/mmdet3d_plugin/models/utils/grid_mask.py +++ b/projects/mmdet3d_plugin/models/utils/grid_mask.py @@ -1,16 +1,25 @@ +import numpy as np import torch import torch.nn as nn -import numpy as np from PIL import Image + class Grid(object): - def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + + def __init__(self, + use_h, + use_w, + rotate=1, + offset=False, + ratio=0.5, + mode=0, + prob=1.): self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio - self.mode=mode + self.mode = mode self.st_prob = prob self.prob = prob @@ -24,50 +33,59 @@ def __call__(self, img, label): w = img.size(2) self.d1 = 2 self.d2 = min(h, w) - hh = int(1.5*h) - ww = int(1.5*w) + hh = int(1.5 * h) + ww = int(1.5 * w) d = np.random.randint(self.d1, self.d2) if self.ratio == 1: self.l = np.random.randint(1, 
d) else: - self.l = min(max(int(d*self.ratio+0.5),1),d-1) + self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: - for i in range(hh//d): - s = d*i + st_h - t = min(s+self.l, hh) - mask[s:t,:] *= 0 + for i in range(hh // d): + s = d * i + st_h + t = min(s + self.l, hh) + mask[s:t, :] *= 0 if self.use_w: - for i in range(ww//d): - s = d*i + st_w - t = min(s+self.l, ww) - mask[:,s:t] *= 0 - + for i in range(ww // d): + s = d * i + st_w + t = min(s + self.l, ww) + mask[:, s:t] *= 0 + r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) - mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, + (ww - w) // 2:(ww - w) // 2 + w] mask = torch.from_numpy(mask).float() if self.mode == 1: - mask = 1-mask + mask = 1 - mask mask = mask.expand_as(img) if self.offset: - offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() + offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float() offset = (1 - mask) * offset img = img * mask + offset else: - img = img * mask + img = img * mask return img, label class GridMask(nn.Module): - def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + + def __init__(self, + use_h, + use_w, + rotate=1, + offset=False, + ratio=0.5, + mode=0, + prob=1.): super(GridMask, self).__init__() self.use_h = use_h self.use_w = use_w @@ -79,45 +97,47 @@ def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, self.prob = prob def set_prob(self, epoch, max_epoch): - self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 + self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 def forward(self, x): if np.random.rand() > self.prob or not self.training: return x - n,c,h,w = x.size() - x = x.view(-1,h,w) - hh = int(1.5*h) - ww = int(1.5*w) + n, c, h, w = x.size() + x = x.view(-1, h, w) + hh = int(1.5 * h) + ww = int(1.5 * w) d = np.random.randint(2, h) - self.l = min(max(int(d*self.ratio+0.5),1),d-1) + self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: - for i in range(hh//d): - s = d*i + st_h - t = min(s+self.l, hh) - mask[s:t,:] *= 0 + for i in range(hh // d): + s = d * i + st_h + t = min(s + self.l, hh) + mask[s:t, :] *= 0 if self.use_w: - for i in range(ww//d): - s = d*i + st_w - t = min(s+self.l, ww) - mask[:,s:t] *= 0 - + for i in range(ww // d): + s = d * i + st_w + t = min(s + self.l, ww) + mask[:, s:t] *= 0 + r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) - mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, + (ww - w) // 2:(ww - w) // 2 + w] mask = torch.from_numpy(mask).float().cuda() if self.mode == 1: - mask = 1-mask + mask = 1 - mask mask = mask.expand_as(x) if self.offset: - offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float().cuda() + offset = torch.from_numpy( + 2 * (np.random.rand(h, w) - 0.5)).float().cuda() x = x * mask + offset * (1 - mask) else: - x = x * mask + x = x * mask - return x.view(n,c,h,w) \ No newline at end of file + return x.view(n, c, h, w) From 470d9a6aa83765635c3b259a12495c6ed63c3536 Mon Sep 17 00:00:00 2001 From: zltjohn <1163842861@qq.com> Date: Thu, 22 Dec 2022 11:12:15 +0800 Subject: [PATCH 2/2] 
correct 'config' to 'configs' --- .../detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py | 0 projects/{config => configs}/detr3d_res101_gridmask_dev-1.x.py | 0 .../detr3d_res101_gridmask_dev-1.x_test_old.py | 0 ...3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename projects/{config => configs}/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py (100%) rename projects/{config => configs}/detr3d_res101_gridmask_dev-1.x.py (100%) rename projects/{config => configs}/detr3d_res101_gridmask_dev-1.x_test_old.py (100%) rename projects/{config => configs}/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py (100%) diff --git a/projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py b/projects/configs/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py similarity index 100% rename from projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py rename to projects/configs/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py diff --git a/projects/config/detr3d_res101_gridmask_dev-1.x.py b/projects/configs/detr3d_res101_gridmask_dev-1.x.py similarity index 100% rename from projects/config/detr3d_res101_gridmask_dev-1.x.py rename to projects/configs/detr3d_res101_gridmask_dev-1.x.py diff --git a/projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py b/projects/configs/detr3d_res101_gridmask_dev-1.x_test_old.py similarity index 100% rename from projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py rename to projects/configs/detr3d_res101_gridmask_dev-1.x_test_old.py diff --git a/projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py b/projects/configs/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py similarity index 100% rename from projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py rename to projects/configs/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py
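
Post-script (illustrative sketch, not part of either patch): `Detr3D_old.predict` above converts dev-1.x boxes back to the pre-1.0 convention by swapping the (w, l) slots and remapping the yaw. A minimal stand-alone check of that remapping on plain tensors in the (cx, cy, cz, w, l, h, yaw, vx, vy) layout; the helper name `old_to_new_boxes` is ours, not from the patch:

import numpy as np
import torch


def old_to_new_boxes(boxes: torch.Tensor) -> torch.Tensor:
    # mirrors Detr3D_old.predict: swap the two size slots, then remap yaw
    boxes = boxes.clone()
    boxes[..., [3, 4]] = boxes[..., [4, 3]]
    boxes[..., 6] = -boxes[..., 6] - np.pi / 2
    return boxes


# the yaw remap is an involution, -(-r - pi/2) - pi/2 == r, and the slot
# swap undoes itself, so applying the conversion twice restores the input
b = torch.tensor([[0., 0., 0., 1.9, 4.2, 1.6, 0.3, 0., 0.]])
assert torch.allclose(old_to_new_boxes(old_to_new_boxes(b)), b)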