From b4fdead39fdbd6cd1c8352857fdea4af69116c2d Mon Sep 17 00:00:00 2001 From: zltjohn <1163842861@qq.com> Date: Thu, 22 Dec 2022 10:49:44 +0800 Subject: [PATCH 1/2] support detr3d in mmdet3d-dev-1.x (v1.1) --- mmdetection3d | 2 +- ...d_res101_gridmask_cbgs_dev-1.x_test_old.py | 285 +++++++++++ .../config/detr3d_res101_gridmask_dev-1.x.py | 241 +++++++++ ...detr3d_res101_gridmask_dev-1.x_test_old.py | 283 ++++++++++ ...et_final_trainval_cbgs_dev-1.x_test_old.py | 279 ++++++++++ .../configs/detr3d/detr3d_res101_gridmask.py | 225 -------- .../detr3d/detr3d_res101_gridmask_cbgs.py | 228 --------- ...vovnet_gridmask_det_final_trainval_cbgs.py | 227 -------- projects/configs/obj_dgcnn/pillar.py | 295 ----------- projects/configs/obj_dgcnn/voxel.py | 293 ----------- projects/mmdet3d_plugin/__init__.py | 23 +- .../core/bbox/assigners/__init__.py | 3 - .../core/bbox/coders/__init__.py | 3 - .../core/bbox/match_costs/__init__.py | 4 - .../core/bbox/match_costs/match_cost.py | 27 - projects/mmdet3d_plugin/core/bbox/util.py | 53 -- projects/mmdet3d_plugin/datasets/__init__.py | 6 +- .../mmdet3d_plugin/datasets/custom_nus.py | 19 + .../datasets/nuscenes_dataset.py | 76 --- .../datasets/pipelines/__init__.py | 11 - .../datasets/pipelines/transform_3d.py | 306 ----------- .../datasets/transforms/__init__.py | 1 + .../datasets/transforms/transform_3d.py | 218 ++++++++ .../models/backbones/__init__.py | 2 - .../mmdet3d_plugin/models/backbones/vovnet.py | 306 ++++++----- .../models/dense_heads/__init__.py | 3 - .../models/dense_heads/detr3d_head.py | 438 ++++++++-------- .../models/dense_heads/dgcnn3d_head.py | 484 ------------------ .../models/detectors/__init__.py | 4 +- .../mmdet3d_plugin/models/detectors/detr3d.py | 362 +++++++------ .../models/detectors/detr3d_old.py | 40 ++ .../models/detectors/obj_dgcnn.py | 187 ------- .../models/task_modules/__init__.py | 3 + .../task_modules}/hungarian_assigner_3d.py | 98 ++-- .../models/task_modules/match_cost.py | 32 ++ .../task_modules}/nms_free_coder.py | 51 +- .../models/task_modules/util.py | 76 +++ .../mmdet3d_plugin/models/utils/__init__.py | 8 +- projects/mmdet3d_plugin/models/utils/detr.py | 106 ---- .../models/utils/detr3d_transformer.py | 287 ++++++----- .../mmdet3d_plugin/models/utils/dgcnn_attn.py | 96 ---- .../mmdet3d_plugin/models/utils/grid_mask.py | 100 ++-- 42 files changed, 2390 insertions(+), 3401 deletions(-) create mode 100644 projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py create mode 100644 projects/config/detr3d_res101_gridmask_dev-1.x.py create mode 100644 projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py create mode 100644 projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py delete mode 100644 projects/configs/detr3d/detr3d_res101_gridmask.py delete mode 100755 projects/configs/detr3d/detr3d_res101_gridmask_cbgs.py delete mode 100755 projects/configs/detr3d/detr3d_vovnet_gridmask_det_final_trainval_cbgs.py delete mode 100755 projects/configs/obj_dgcnn/pillar.py delete mode 100755 projects/configs/obj_dgcnn/voxel.py mode change 100755 => 100644 projects/mmdet3d_plugin/__init__.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/assigners/__init__.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/coders/__init__.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py delete mode 100755 projects/mmdet3d_plugin/core/bbox/util.py create mode 100644 
projects/mmdet3d_plugin/datasets/custom_nus.py delete mode 100644 projects/mmdet3d_plugin/datasets/nuscenes_dataset.py delete mode 100755 projects/mmdet3d_plugin/datasets/pipelines/__init__.py delete mode 100755 projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py create mode 100644 projects/mmdet3d_plugin/datasets/transforms/__init__.py create mode 100644 projects/mmdet3d_plugin/datasets/transforms/transform_3d.py mode change 100755 => 100644 projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py delete mode 100755 projects/mmdet3d_plugin/models/dense_heads/dgcnn3d_head.py mode change 100755 => 100644 projects/mmdet3d_plugin/models/detectors/detr3d.py create mode 100644 projects/mmdet3d_plugin/models/detectors/detr3d_old.py delete mode 100755 projects/mmdet3d_plugin/models/detectors/obj_dgcnn.py create mode 100644 projects/mmdet3d_plugin/models/task_modules/__init__.py rename projects/mmdet3d_plugin/{core/bbox/assigners => models/task_modules}/hungarian_assigner_3d.py (63%) create mode 100755 projects/mmdet3d_plugin/models/task_modules/match_cost.py rename projects/mmdet3d_plugin/{core/bbox/coders => models/task_modules}/nms_free_coder.py (73%) create mode 100755 projects/mmdet3d_plugin/models/task_modules/util.py delete mode 100755 projects/mmdet3d_plugin/models/utils/detr.py delete mode 100755 projects/mmdet3d_plugin/models/utils/dgcnn_attn.py diff --git a/mmdetection3d b/mmdetection3d index 60ce864..9a425a1 160000 --- a/mmdetection3d +++ b/mmdetection3d @@ -1 +1 @@ -Subproject commit 60ce864ff76af4316fb9ae56a2a5b7741bfdd9ab +Subproject commit 9a425a1109d29e808ab8e2d6ddaff2ae004cfa56 diff --git a/projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py b/projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py new file mode 100644 index 0000000..fc7d88f --- /dev/null +++ b/projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py @@ -0,0 +1,285 @@ +_base_ = [ + # '.../mmdetection3d/configs/_base_/datasets/nus-3d.py', + '../../mmdetection3d/configs/_base_/default_runtime.py' +] + +custom_imports = dict(imports=['projects.mmdet3d_plugin']) +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict(use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +# this means type='Detr3D' will be processed as 'mmdet3d.Detr3D' +default_scope = 'mmdet3d' +model = dict( + type='Detr3D_old', + use_grid_mask=True, + data_preprocessor=dict(type='Det3DDataPreprocessor', + **img_norm_cfg, + pad_size_divisor=32), + img_backbone=dict(type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', + deform_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + img_neck=dict(type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='Detr3DHead', + num_query=900, + num_classes=10, + 
in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='Detr3DTransformer', + decoder=dict( + type='Detr3DTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', # mmcv. + embed_dims=256, + num_heads=8, + dropout=0.1), + dict(type='Detr3DCrossAtten', + pc_range=point_cloud_range, + num_points=1, + embed_dims=256) + ], + feedforward_channels=512, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict(type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # ↓ Fake cost. This is just to make it compatible with DETR head. + iou_cost=dict(type='mmdet.IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nus_v2/' + +test_transforms = [ + dict(type='RandomResize3D', + scale=(1600, 900), + ratio_range=(1., 1.), + keep_ratio=True) +] +train_transforms = test_transforms + +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='MultiViewWrapper', transforms=train_transforms), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='MultiViewWrapper', transforms=test_transforms), + dict(type='Pack3DDetInputs', keys=['img']) +] + +metainfo = dict(classes=class_names) +data_prefix = dict(pts='', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT') + +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + load_type='frame_based', + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
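+            # DETR3D is camera-only, but its predictions are still decoded
+            # into the LiDAR frame, which is the convention NuScenesMetric
+            # evaluates against.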
+ box_type_3d='LiDAR'))) + +val_dataloader = dict(batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + load_type='frame_based', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=True, + data_prefix=data_prefix, + box_type_3d='LiDAR')) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox') +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), + paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), + clip_grad=dict(max_norm=35, norm_type=2), +) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict(type='CosineAnnealingLR', + by_epoch=True, + begin=0, + end=24, + T_max=24, + eta_min_ratio=1e-3) +] + +total_epochs = 24 + +train_cfg = dict(type='EpochBasedTrainLoop', + max_epochs=total_epochs, + val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# checkpoint_config = dict(interval=1, max_keep_ckpts=1) +default_hooks = dict(checkpoint=dict( + type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) +load_from = 'ckpts/fcos3d_yue.pth' + +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict(type='Det3DLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') + +# before fixing h,w bug in feature-sampling +# mAP: 0.3405 +# mATE: 0.7516 +# mASE: 0.2688 +# mAOE: 0.3750 +# mAVE: 0.8621 +# mAAE: 0.2080 +# NDS: 0.4237 +# Eval time: 124.2s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.513 0.590 0.153 0.066 0.932 0.197 +# truck 0.280 0.771 0.206 0.092 1.001 0.237 +# bus 0.356 0.860 0.193 0.131 1.826 0.370 +# trailer 0.179 1.110 0.234 0.564 0.950 0.159 +# construction_vehicle 0.080 0.988 0.444 0.947 0.120 0.330 +# pedestrian 0.397 0.683 0.305 0.532 0.469 0.200 +# motorcycle 0.338 0.712 0.255 0.359 1.194 0.158 +# bicycle 0.256 0.657 0.284 0.583 0.405 0.014 +# traffic_cone 0.508 0.542 0.323 nan nan nan +# barrier 0.501 0.602 0.292 0.100 nan nan + +# after fixing h,w bug in feature-sampling +# mAP: 0.3493 +# mATE: 0.7162 +# mASE: 0.2682 +# mAOE: 0.3795 +# mAVE: 0.8417 +# mAAE: 0.1996 +# NDS: 0.4341 +# Eval time: 128.7s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.542 0.533 0.151 0.064 0.954 0.193 +# truck 0.285 0.774 0.208 0.093 1.016 0.239 +# bus 0.363 0.796 0.192 0.137 1.842 0.379 +# trailer 0.167 1.075 0.236 0.610 0.718 0.094 +# construction_vehicle 0.081 0.970 0.438 0.914 0.114 0.337 +# pedestrian 0.400 0.684 0.306 0.531 0.474 0.201 +# motorcycle 0.337 0.684 0.257 0.383 1.203 0.143 +# bicycle 0.261 0.631 0.280 0.582 0.411 0.012 +# traffic_cone 0.531 0.478 0.324 nan nan nan +# barrier 0.525 0.536 0.291 0.102 nan nan diff --git a/projects/config/detr3d_res101_gridmask_dev-1.x.py b/projects/config/detr3d_res101_gridmask_dev-1.x.py new file mode 100644 index 0000000..2c14d6f --- /dev/null +++ b/projects/config/detr3d_res101_gridmask_dev-1.x.py @@ -0,0 +1,241 @@ +_base_ = [ + # '.../mmdetection3d/configs/_base_/datasets/nus-3d.py', + '../../mmdetection3d/configs/_base_/default_runtime.py' +] + +custom_imports = dict(imports=['projects.mmdet3d_plugin']) +# If point cloud range is 
changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict(use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +# this means type='Detr3D' will be processed as 'mmdet3d.Detr3D' +default_scope = 'mmdet3d' +model = dict( + type='Detr3D', + use_grid_mask=True, + data_preprocessor=dict(type='Det3DDataPreprocessor', + **img_norm_cfg, + pad_size_divisor=32), + img_backbone=dict(type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', + deform_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + img_neck=dict(type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='Detr3DHead', + num_query=900, + num_classes=10, + in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='Detr3DTransformer', + decoder=dict( + type='Detr3DTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', # mmcv. + embed_dims=256, + num_heads=8, + dropout=0.1), + dict(type='Detr3DCrossAtten', + pc_range=point_cloud_range, + num_points=1, + embed_dims=256) + ], + feedforward_channels=512, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict(type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # ↓ Fake cost. This is just to make it compatible with DETR head. 
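+            # With weight=0.0 the IoU term contributes nothing to the
+            # matching cost; it is kept only so the assigner config mirrors
+            # the 2D DETR head's.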
+ iou_cost=dict(type='mmdet.IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nus_v2/' + +test_transforms = [ + dict(type='RandomResize3D', + scale=(1600, 900), + ratio_range=(1., 1.), + keep_ratio=True) +] +train_transforms = test_transforms + +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='MultiViewWrapper', transforms=train_transforms), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='MultiViewWrapper', transforms=test_transforms), + dict(type='Pack3DDetInputs', keys=['img']) +] + +metainfo = dict(classes=class_names) +data_prefix = dict(pts='', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT') + +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + load_type='frame_based', + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR')) + +val_dataloader = dict(batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + load_type='frame_based', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=True, + data_prefix=data_prefix, + box_type_3d='LiDAR')) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox') +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), + paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), + clip_grad=dict(max_norm=35, norm_type=2), +) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict(type='CosineAnnealingLR', + by_epoch=True, + begin=0, + end=24, + T_max=24, + eta_min_ratio=1e-3) +] + +total_epochs = 24 + +train_cfg = dict(type='EpochBasedTrainLoop', + max_epochs=total_epochs, + val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# checkpoint_config = dict(interval=1, max_keep_ckpts=1) +default_hooks = dict(checkpoint=dict( + type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) +load_from = 'ckpts/fcos3d_yue.pth' + +# ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
+# This behaviour is the source of the following dependency conflicts. +# jupyter-packaging 0.12.3 requires setuptools>=60.2.0, but you have setuptools 58.0.4 which is incompatible. +# setuptools 65 downgrades to 58. On mmlab-node we use setuptools 61 and no errors occur. +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict(type='Det3DLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') diff --git a/projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py b/projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py new file mode 100644 index 0000000..d48b6d0 --- /dev/null +++ b/projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py @@ -0,0 +1,283 @@ +_base_ = [ + # '.../mmdetection3d/configs/_base_/datasets/nus-3d.py', + '../../mmdetection3d/configs/_base_/default_runtime.py' +] + +custom_imports = dict(imports=['projects.mmdet3d_plugin']) +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict(use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +# this means type='Detr3D' will be processed as 'mmdet3d.Detr3D' +default_scope = 'mmdet3d' +model = dict( + type='Detr3D_old', + use_grid_mask=True, + data_preprocessor=dict(type='Det3DDataPreprocessor', + **img_norm_cfg, + pad_size_divisor=32), + img_backbone=dict(type='mmdet.ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', + deform_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, False, True, True)), + img_neck=dict(type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='Detr3DHead', + num_query=900, + num_classes=10, + in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='Detr3DTransformer', + decoder=dict( + type='Detr3DTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', # mmcv.
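+                            # unscoped type: MultiheadAttention is registered
+                            # by mmcv, so no 'mmdet.' prefix is needed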
+ embed_dims=256, + num_heads=8, + dropout=0.1), + dict(type='Detr3DCrossAtten', + pc_range=point_cloud_range, + num_points=1, + embed_dims=256) + ], + feedforward_channels=512, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict(type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # ↓ Fake cost. This is just to make it compatible with DETR head. + iou_cost=dict(type='mmdet.IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nus_v2/' + +test_transforms = [ + dict(type='RandomResize3D', + scale=(1600, 900), + ratio_range=(1., 1.), + keep_ratio=True) +] +train_transforms = test_transforms + +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='MultiViewWrapper', transforms=train_transforms), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='MultiViewWrapper', transforms=test_transforms), + dict(type='Pack3DDetInputs', keys=['img']) +] + +metainfo = dict(classes=class_names) +data_prefix = dict(pts='', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT') + +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + load_type='frame_based', + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
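+            # unlike the *_cbgs variant, this config trains on the raw,
+            # class-imbalanced nuScenes distribution (no CBGS resampling)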
+ box_type_3d='LiDAR')) + +val_dataloader = dict(batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + load_type='frame_based', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=True, + data_prefix=data_prefix, + box_type_3d='LiDAR')) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox') +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), + paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), + clip_grad=dict(max_norm=35, norm_type=2), +) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict(type='CosineAnnealingLR', + by_epoch=True, + begin=0, + end=24, + T_max=24, + eta_min_ratio=1e-3) +] + +total_epochs = 24 + +train_cfg = dict(type='EpochBasedTrainLoop', + max_epochs=total_epochs, + val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# checkpoint_config = dict(interval=1, max_keep_ckpts=1) +default_hooks = dict(checkpoint=dict( + type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) +load_from = 'ckpts/fcos3d_yue.pth' + +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict(type='Det3DLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') + +# before fixing h,w bug in feature-sampling +# mAP: 0.3450 +# mATE: 0.7740 +# mASE: 0.2675 +# mAOE: 0.3960 +# mAVE: 0.8737 +# mAAE: 0.2156 +# NDS: 0.4198 +# Eval time: 161.5s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.534 0.565 0.152 0.071 0.907 0.214 +# truck 0.285 0.839 0.213 0.114 0.984 0.229 +# bus 0.346 0.924 0.199 0.117 2.060 0.379 +# trailer 0.166 1.108 0.230 0.551 0.734 0.126 +# construction_vehicle 0.082 1.057 0.446 1.013 0.125 0.387 +# pedestrian 0.426 0.688 0.294 0.508 0.459 0.195 +# motorcycle 0.343 0.696 0.260 0.475 1.268 0.180 +# bicycle 0.275 0.691 0.275 0.578 0.452 0.015 +# traffic_cone 0.521 0.555 0.314 nan nan nan +# barrier 0.473 0.619 0.293 0.138 nan nan + +# after fixing h,w bug in feature-sampling +# mAP: 0.3469 +# mATE: 0.7651 +# mASE: 0.2678 +# mAOE: 0.3916 +# mAVE: 0.8758 +# mAAE: 0.2110 +# NDS: 0.4223 +# Eval time: 117.2s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.546 0.544 0.152 0.070 0.911 0.208 +# truck 0.286 0.834 0.212 0.113 1.005 0.231 +# bus 0.346 0.870 0.196 0.116 2.063 0.383 +# trailer 0.167 1.106 0.233 0.549 0.687 0.093 +# construction_vehicle 0.082 1.060 0.449 0.960 0.120 0.384 +# pedestrian 0.424 0.700 0.295 0.512 0.462 0.194 +# motorcycle 0.340 0.709 0.259 0.489 1.288 0.176 +# bicycle 0.278 0.698 0.275 0.586 0.473 0.018 +# traffic_cone 0.529 0.526 0.313 nan nan nan +# barrier 0.471 0.603 0.292 0.131 nan nan diff --git a/projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py b/projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py new file mode 100644 index 0000000..942e719 --- /dev/null +++ b/projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py @@ -0,0 +1,279 @@ +_base_ = [ + # '.../mmdetection3d/configs/_base_/datasets/nus-3d.py', + '../../mmdetection3d/configs/_base_/default_runtime.py' 
+] + +custom_imports = dict(imports=['projects.mmdet3d_plugin']) +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict(use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +# this means type='Detr3D' will be processed as 'mmdet3d.Detr3D' +default_scope = 'mmdet3d' +model = dict( + type='Detr3D_old', + use_grid_mask=True, + data_preprocessor=dict(type='Det3DDataPreprocessor', + **img_norm_cfg, + pad_size_divisor=32), + img_backbone=dict(type='VoVNet', + spec_name='V-99-eSE', + norm_eval=True, + frozen_stages=1, + input_ch=3, + out_features=['stage2', 'stage3', 'stage4', 'stage5']), + img_neck=dict(type='mmdet.FPN', + in_channels=[256, 512, 768, 1024], + out_channels=256, + start_level=0, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='Detr3DHead', + num_query=900, + num_classes=10, + in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + transformer=dict( + type='Detr3DTransformer', + decoder=dict( + type='Detr3DTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', # mmcv. + embed_dims=256, + num_heads=8, + dropout=0.1), + dict(type='Detr3DCrossAtten', + pc_range=point_cloud_range, + num_points=1, + embed_dims=256) + ], + feedforward_channels=512, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict(type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + # ↓ Fake cost. This is just to make it compatible with DETR head. 
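+            # BBox3DL1Cost is this project's L1 cost on the encoded 3D box
+            # parameters (models/task_modules/match_cost.py in this patch)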
+ iou_cost=dict(type='mmdet.IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nus_v2/' + +test_transforms = [ + dict(type='RandomResize3D', + scale=(1600, 900), + ratio_range=(1., 1.), + keep_ratio=True) +] +train_transforms = test_transforms + +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='MultiViewWrapper', transforms=train_transforms), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6), + dict(type='MultiViewWrapper', transforms=test_transforms), + dict(type='Pack3DDetInputs', keys=['img']) +] + +metainfo = dict(classes=class_names) +data_prefix = dict(pts='', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT') + +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_trainval.pkl', + pipeline=train_pipeline, + load_type='frame_based', + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
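+                # this run trains on train+val ('trainval'), so the val-set
+                # metrics reported below are optimistic by construction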
+ box_type_3d='LiDAR'))) + +val_dataloader = dict(batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + load_type='frame_based', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=True, + data_prefix=data_prefix, + box_type_3d='LiDAR')) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox') +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), + paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), + clip_grad=dict(max_norm=35, norm_type=2), +) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict(type='CosineAnnealingLR', + by_epoch=True, + begin=0, + end=24, + T_max=24, + eta_min_ratio=1e-3) +] + +total_epochs = 24 + +train_cfg = dict(type='EpochBasedTrainLoop', + max_epochs=total_epochs, + val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# checkpoint_config = dict(interval=1, max_keep_ckpts=1) +default_hooks = dict(checkpoint=dict( + type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) +load_from = 'ckpts/fcos3d_yue.pth' + +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict(type='Det3DLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') + +# before fixing h,w bug in feature-sampling +# mAP: 0.7103 +# mATE: 0.5395 +# mASE: 0.1455 +# mAOE: 0.0719 +# mAVE: 0.2233 +# mAAE: 0.1862 +# NDS: 0.7385 +# Eval time: 107.3s +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.706 0.569 0.116 0.033 0.261 0.202 +# truck 0.737 0.483 0.120 0.034 0.195 0.208 +# bus 0.760 0.463 0.108 0.028 0.296 0.240 +# trailer 0.739 0.453 0.124 0.042 0.138 0.147 +# construction_vehicle 0.710 0.513 0.178 0.085 0.139 0.329 +# pedestrian 0.715 0.510 0.205 0.203 0.248 0.138 +# motorcycle 0.692 0.560 0.149 0.089 0.357 0.218 +# bicycle 0.673 0.643 0.171 0.081 0.152 0.008 +# traffic_cone 0.691 0.569 0.172 nan nan nan +# barrier 0.681 0.633 0.112 0.052 nan nan + +# after fixing h,w bug in feature-sampling +# mAP: 0.8348 +# mATE: 0.3225 +# mASE: 0.1417 +# mAOE: 0.0676 +# mAVE: 0.2204 +# mAAE: 0.1820 +# NDS: 0.8240 +# Eval time: 97.4s + +# Per-class results: +# Object Class AP ATE ASE AOE AVE AAE +# car 0.873 0.256 0.114 0.033 0.260 0.195 +# truck 0.833 0.327 0.115 0.033 0.191 0.216 +# bus 0.842 0.323 0.104 0.027 0.293 0.244 +# trailer 0.779 0.394 0.116 0.041 0.136 0.123 +# construction_vehicle 0.784 0.406 0.174 0.079 0.137 0.320 +# pedestrian 0.806 0.380 0.203 0.181 0.244 0.135 +# motorcycle 0.822 0.337 0.150 0.085 0.347 0.213 +# bicycle 0.871 0.271 0.169 0.079 0.154 0.009 +# traffic_cone 0.877 0.241 0.162 nan nan nan +# barrier 0.861 0.289 0.110 0.050 nan nan diff --git a/projects/configs/detr3d/detr3d_res101_gridmask.py b/projects/configs/detr3d/detr3d_res101_gridmask.py deleted file mode 100644 index a4ff267..0000000 --- a/projects/configs/detr3d/detr3d_res101_gridmask.py +++ /dev/null @@ -1,225 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is 
changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=False) - -model = dict( - type='Detr3D', - use_grid_mask=True, - img_backbone=dict( - type='ResNet', - depth=101, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN2d', requires_grad=False), - norm_eval=True, - style='caffe', - dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), - stage_with_dcn=(False, False, True, True)), - img_neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - start_level=1, - add_extra_convs='on_output', - num_outs=4, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='Detr3DHead', - num_query=900, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='Detr3DTransformer', - decoder=dict( - type='Detr3DTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - dropout=0.1), - dict( - type='Detr3DCrossAtten', - pc_range=point_cloud_range, - num_points=1, - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
- pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['img']) - ]) -] - - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -optimizer = dict( - type='AdamW', - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=2, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) -load_from='ckpts/fcos3d.pth' diff --git a/projects/configs/detr3d/detr3d_res101_gridmask_cbgs.py b/projects/configs/detr3d/detr3d_res101_gridmask_cbgs.py deleted file mode 100755 index b23d7b3..0000000 --- a/projects/configs/detr3d/detr3d_res101_gridmask_cbgs.py +++ /dev/null @@ -1,228 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=False) - -model = dict( - type='Detr3D', - use_grid_mask=True, - img_backbone=dict( - type='ResNet', - depth=101, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN2d', requires_grad=False), - norm_eval=True, - style='caffe', - dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), - stage_with_dcn=(False, False, True, True)), - img_neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - start_level=1, - add_extra_convs='on_output', - num_outs=4, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='Detr3DHead', - num_query=900, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='Detr3DTransformer', - decoder=dict( - type='Detr3DTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - dropout=0.1), - dict( - type='Detr3DCrossAtten', - pc_range=point_cloud_range, - num_points=1, - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', 
loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. - pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['img']) - ]) -] - - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - ), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -optimizer = dict( - type='AdamW', - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=2, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) -load_from='pretrained/fcos3d.pth' diff --git a/projects/configs/detr3d/detr3d_vovnet_gridmask_det_final_trainval_cbgs.py b/projects/configs/detr3d/detr3d_vovnet_gridmask_det_final_trainval_cbgs.py deleted file mode 100755 index 4366532..0000000 --- a/projects/configs/detr3d/detr3d_vovnet_gridmask_det_final_trainval_cbgs.py +++ /dev/null @@ -1,227 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=False, - use_camera=True, - use_radar=False, - use_map=False, - use_external=True) - -model = dict( - type='Detr3D', - use_grid_mask=True, - img_backbone=dict( - type='VoVNet', - spec_name='V-99-eSE', - norm_eval=True, - frozen_stages=1, - input_ch=3, - out_features=['stage2', 'stage3', 'stage4', 'stage5']), - img_neck=dict( - type='FPN', - in_channels=[256, 512, 768, 1024], - out_channels=256, - start_level=0, - add_extra_convs='on_output', - num_outs=4, - relu_before_extra_convs=True), - pts_bbox_head=dict( - type='Detr3DHead', - num_query=900, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], - transformer=dict( - type='Detr3DTransformer', - decoder=dict( - type='Detr3DTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - dropout=0.1), - dict( - type='Detr3DCrossAtten', - pc_range=point_cloud_range, - num_points=1, - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', 
loss_weight=0.0)), - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. - pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='PhotoMetricDistortionMultiViewImage'), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) -] -test_pipeline = [ - dict(type='LoadMultiViewImageFromFiles', to_float32=True), - dict(type='NormalizeMultiviewImage', **img_norm_cfg), - dict(type='PadMultiViewImage', size_divisor=32), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['img']) - ]) -] - - -data = dict( - samples_per_gpu=1, - workers_per_gpu=4, - train=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_trainval.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - ), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -optimizer = dict( - type='AdamW', - lr=2e-4, - paramwise_cfg=dict( - custom_keys={ - 'img_backbone': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - min_lr_ratio=1e-3) -total_epochs = 24 -evaluation = dict(interval=2, pipeline=test_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) -load_from='ckpts/dd3d_det_final.pth' - -find_unused_parameters=True diff --git a/projects/configs/obj_dgcnn/pillar.py b/projects/configs/obj_dgcnn/pillar.py deleted file mode 100755 index 39485de..0000000 --- a/projects/configs/obj_dgcnn/pillar.py +++ /dev/null @@ -1,295 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/schedules/cyclic_20e.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.2, 0.2, 8] - -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=True, - use_camera=False, - use_radar=False, - use_map=False, - use_external=False) - -model = dict( - type='ObjDGCNN', - pts_voxel_layer=dict( - max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000), - point_cloud_range=point_cloud_range), - pts_voxel_encoder=dict( - type='PillarFeatureNet', - in_channels=5, - feat_channels=[64], - with_distance=False, - voxel_size=(0.2, 0.2, 8), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - point_cloud_range=point_cloud_range, - legacy=False), - pts_middle_encoder=dict( - type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), - pts_backbone=dict( - type='SECOND', - in_channels=64, - out_channels=[64, 128, 256], - layer_nums=[3, 5, 5], - layer_strides=[2, 2, 2], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False)), - pts_neck=dict( - type='FPN', - norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01), - act_cfg=dict(type='ReLU'), - in_channels=[64, 128, 256], - out_channels=256, - start_level=0, - num_outs=4), - pts_bbox_head=dict( - type='DGCNN3DHead', - num_query=300, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='DeformableDetrTransformer', - encoder=dict( - type='DetrTransformerEncoder', - num_layers=2, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', embed_dims=256), - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'ffn', 'norm'))), - decoder=dict( - type='Deformable3DDetrTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='DGCNNAttn', - embed_dims=256, - num_heads=8, - K=16, - dropout=0.1), - dict( - 
type='MultiScaleDeformableAttention', - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), # For DETR compatibility. - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. - pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), - dict( - type='GlobalRotScaleTrans', - rot_range=[-0.3925, 0.3925], - scale_ratio_range=[0.95, 1.05], - translation_std=[0, 0, 0]), - dict( - type='RandomFlip3D', - sync_2d=False, - flip_ratio_bev_horizontal=0.5, - flip_ratio_bev_vertical=0.5), - dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='PointShuffle'), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['points']) - ]) -] - -# 
construct a pipeline for data and gt loading in show function -# please keep its loading function consistent with test_pipeline (e.g. client) -eval_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['points']) -] - -data = dict( - samples_per_gpu=3, - workers_per_gpu=4, - train=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. - box_type_3d='LiDAR'), - ), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -evaluation = dict(interval=20, pipeline=eval_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=20) - -optimizer = dict( - type='AdamW', - lr=1e-4, - paramwise_cfg=dict( - custom_keys={ - 'pts_voxel_encoder': dict(lr_mult=0.1), - 'SECOND': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) - -load_from='ckpts/pillar.pth' -find_unused_parameters = True \ No newline at end of file diff --git a/projects/configs/obj_dgcnn/voxel.py b/projects/configs/obj_dgcnn/voxel.py deleted file mode 100755 index a7a76be..0000000 --- a/projects/configs/obj_dgcnn/voxel.py +++ /dev/null @@ -1,293 +0,0 @@ -_base_ = [ - '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py', - '../../../mmdetection3d/configs/_base_/schedules/cyclic_20e.py', - '../../../mmdetection3d/configs/_base_/default_runtime.py' -] - -plugin=True -plugin_dir='projects/mmdet3d_plugin/' - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -voxel_size = [0.1, 0.1, 0.2] - -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] - -input_modality = dict( - use_lidar=True, - use_camera=False, - use_radar=False, - use_map=False, - use_external=False) - -model = dict( - type='ObjDGCNN', - pts_voxel_layer=dict( - max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000), - point_cloud_range=point_cloud_range), - pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), - pts_middle_encoder=dict( - type='SparseEncoder', - in_channels=5, - sparse_shape=[41, 1024, 1024], - output_channels=128, - order=('conv', 'norm', 'act'), - encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, - 128)), - encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), - block_type='basicblock'), - pts_backbone=dict( - type='SECOND', - in_channels=256, - out_channels=[128, 256], - layer_nums=[5, 5], - layer_strides=[1, 2], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False)), - pts_neck=dict( - type='FPN', - norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01), - 
act_cfg=dict(type='ReLU'), - in_channels=[128, 256], - out_channels=256, - start_level=0, - num_outs=4), - pts_bbox_head=dict( - type='DGCNN3DHead', - num_query=300, - num_classes=10, - in_channels=256, - sync_cls_avg_factor=True, - with_box_refine=True, - as_two_stage=False, - transformer=dict( - type='DeformableDetrTransformer', - encoder=dict( - type='DetrTransformerEncoder', - num_layers=2, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', embed_dims=256), - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'ffn', 'norm'))), - decoder=dict( - type='Deformable3DDetrTransformerDecoder', - num_layers=6, - return_intermediate=True, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=[ - dict( - type='DGCNNAttn', - embed_dims=256, - num_heads=8, - K=16, - dropout=0.1), - dict( - type='MultiScaleDeformableAttention', - embed_dims=256) - ], - feedforward_channels=512, - ffn_dropout=0.1, - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm')))), - bbox_coder=dict( - type='NMSFreeCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - pc_range=point_cloud_range, - max_num=300, - voxel_size=voxel_size, - num_classes=10), - positional_encoding=dict( - type='SinePositionalEncoding', - num_feats=128, - normalize=True, - offset=-0.5), - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=2.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.25), - loss_iou=dict(type='GIoULoss', loss_weight=0.0)), # For DETR compatibility. - # model training and testing settings - train_cfg=dict(pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - out_size_factor=4, - assigner=dict( - type='HungarianAssigner3D', - cls_cost=dict(type='FocalLossCost', weight=2.0), - reg_cost=dict(type='BBox3DL1Cost', weight=0.25), - iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
- pc_range=point_cloud_range)))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' - -file_client_args = dict(backend='disk') - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args)) - -train_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), - dict( - type='GlobalRotScaleTrans', - rot_range=[-0.3925, 0.3925], - scale_ratio_range=[0.95, 1.05], - translation_std=[0, 0, 0]), - dict( - type='RandomFlip3D', - sync_2d=False, - flip_ratio_bev_horizontal=0.5, - flip_ratio_bev_vertical=0.5), - dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='PointShuffle'), - dict(type='DefaultFormatBundle3D', class_names=class_names), - dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['points']) - ]) -] - -# construct a pipeline for data and gt loading in show function -# please keep its loading function consistent with test_pipeline (e.g. client) -eval_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=file_client_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - file_client_args=file_client_args, - pad_empty_sweeps=True, - remove_close=True), - dict( - type='DefaultFormatBundle3D', - class_names=class_names, - with_label=False), - dict(type='Collect3D', keys=['points']) -] - -data = dict( - samples_per_gpu=4, - workers_per_gpu=6, - train=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_train.pkl', - pipeline=train_pipeline, - classes=class_names, - modality=input_modality, - test_mode=False, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'), - ), - val=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality), - test=dict(pipeline=test_pipeline, classes=class_names, modality=input_modality)) - -evaluation = dict(interval=20, pipeline=eval_pipeline) - -runner = dict(type='EpochBasedRunner', max_epochs=20) -optimizer = dict( - type='AdamW', - lr=1e-4, - paramwise_cfg=dict( - custom_keys={ - 'pts_voxel_encoder': dict(lr_mult=0.1), - 'SECOND': dict(lr_mult=0.1), - }), - weight_decay=0.01) -optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) -load_from='pretrained/voxel.pth' -find_unused_parameters = True \ No newline at end of file diff --git a/projects/mmdet3d_plugin/__init__.py b/projects/mmdet3d_plugin/__init__.py old mode 100755 new mode 100644 index b0e5d56..8c4ea6d --- a/projects/mmdet3d_plugin/__init__.py +++ b/projects/mmdet3d_plugin/__init__.py @@ -1,16 +1,11 @@ -from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D -from .core.bbox.coders.nms_free_coder import NMSFreeCoder -from .core.bbox.match_costs import BBox3DL1Cost -from .datasets import CustomNuScenesDataset -from .datasets.pipelines import ( - PhotoMetricDistortionMultiViewImage, PadMultiViewImage, - NormalizeMultiviewImage, CropMultiViewImage, RandomScaleImageMultiViewImage, - HorizontalRandomFlipMultiViewImage) +from .datasets.transforms import PhotoMetricDistortionMultiViewImage +# from .datasets import custom_nuscenes from .models.backbones.vovnet import VoVNet -from .models.detectors.obj_dgcnn import ObjDGCNN -from .models.detectors.detr3d import Detr3D -from .models.dense_heads.dgcnn3d_head import DGCNN3DHead from .models.dense_heads.detr3d_head import Detr3DHead -from .models.utils.detr import Deformable3DDetrTransformerDecoder -from .models.utils.dgcnn_attn import DGCNNAttn -from .models.utils.detr3d_transformer import Detr3DTransformer, Detr3DTransformerDecoder, Detr3DCrossAtten +from .models.detectors.detr3d import Detr3D +from .models.task_modules.hungarian_assigner_3d import HungarianAssigner3D +from .models.task_modules.match_cost import BBox3DL1Cost +from .models.task_modules.nms_free_coder import NMSFreeCoder +from .models.utils.detr3d_transformer import (Detr3DCrossAtten, + Detr3DTransformer, + Detr3DTransformerDecoder) diff --git a/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py b/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py deleted file mode 100755 index 23156d8..0000000 --- a/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .hungarian_assigner_3d import HungarianAssigner3D - -__all__ = ['HungarianAssigner3D'] diff --git a/projects/mmdet3d_plugin/core/bbox/coders/__init__.py b/projects/mmdet3d_plugin/core/bbox/coders/__init__.py deleted file mode 100755 index c670f5e..0000000 --- a/projects/mmdet3d_plugin/core/bbox/coders/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .nms_free_coder import NMSFreeCoder - -__all__ = ['NMSFreeCoder'] diff --git a/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py b/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py deleted file mode 100755 index aac1a82..0000000 --- a/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from mmdet.core.bbox.match_costs import build_match_cost -from .match_cost import BBox3DL1Cost - -__all__ = ['build_match_cost', 'BBox3DL1Cost'] \ No newline at end of file diff --git a/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py b/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py 
deleted file mode 100755 index d9678f3..0000000 --- a/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py +++ /dev/null @@ -1,27 +0,0 @@ -import torch -from mmdet.core.bbox.match_costs.builder import MATCH_COST - - -@MATCH_COST.register_module() -class BBox3DL1Cost(object): - """BBox3DL1Cost. - Args: - weight (int | float, optional): loss_weight - """ - - def __init__(self, weight=1.): - self.weight = weight - - def __call__(self, bbox_pred, gt_bboxes): - """ - Args: - bbox_pred (Tensor): Predicted boxes with normalized coordinates - (cx, cy, w, h), which are all in range [0, 1]. Shape - [num_query, 4]. - gt_bboxes (Tensor): Ground truth boxes with normalized - coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. - Returns: - torch.Tensor: bbox_cost value with weight - """ - bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) - return bbox_cost * self.weight \ No newline at end of file diff --git a/projects/mmdet3d_plugin/core/bbox/util.py b/projects/mmdet3d_plugin/core/bbox/util.py deleted file mode 100755 index 764df44..0000000 --- a/projects/mmdet3d_plugin/core/bbox/util.py +++ /dev/null @@ -1,53 +0,0 @@ -import torch - - -def normalize_bbox(bboxes, pc_range): - - cx = bboxes[..., 0:1] - cy = bboxes[..., 1:2] - cz = bboxes[..., 2:3] - w = bboxes[..., 3:4].log() - l = bboxes[..., 4:5].log() - h = bboxes[..., 5:6].log() - - rot = bboxes[..., 6:7] - if bboxes.size(-1) > 7: - vx = bboxes[..., 7:8] - vy = bboxes[..., 8:9] - normalized_bboxes = torch.cat( - (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 - ) - else: - normalized_bboxes = torch.cat( - (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 - ) - return normalized_bboxes - -def denormalize_bbox(normalized_bboxes, pc_range): - # rotation - rot_sine = normalized_bboxes[..., 6:7] - - rot_cosine = normalized_bboxes[..., 7:8] - rot = torch.atan2(rot_sine, rot_cosine) - - # center in the bev - cx = normalized_bboxes[..., 0:1] - cy = normalized_bboxes[..., 1:2] - cz = normalized_bboxes[..., 4:5] - - # size - w = normalized_bboxes[..., 2:3] - l = normalized_bboxes[..., 3:4] - h = normalized_bboxes[..., 5:6] - - w = w.exp() - l = l.exp() - h = h.exp() - if normalized_bboxes.size(-1) > 8: - # velocity - vx = normalized_bboxes[:, 8:9] - vy = normalized_bboxes[:, 9:10] - denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) - else: - denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) - return denormalized_bboxes \ No newline at end of file diff --git a/projects/mmdet3d_plugin/datasets/__init__.py b/projects/mmdet3d_plugin/datasets/__init__.py index 99b36cb..ad6c3f9 100644 --- a/projects/mmdet3d_plugin/datasets/__init__.py +++ b/projects/mmdet3d_plugin/datasets/__init__.py @@ -1,5 +1 @@ -from .nuscenes_dataset import CustomNuScenesDataset - -__all__ = [ - 'CustomNuScenesDataset' -] +# from .custom_nus import custom_nuscenes diff --git a/projects/mmdet3d_plugin/datasets/custom_nus.py b/projects/mmdet3d_plugin/datasets/custom_nus.py new file mode 100644 index 0000000..75d93dc --- /dev/null +++ b/projects/mmdet3d_plugin/datasets/custom_nus.py @@ -0,0 +1,19 @@ +# from mmdet3d.datasets.nuscenes_dataset import NuScenesDataset +# from typing import Callable, List, Union +# from mmdet3d.structures.bbox_3d.utils import get_lidar2img +# import torch + +# from mmdet3d.registry import DATASETS +# @DATASETS.register_module() +# class custom_nuscenes(NuScenesDataset): +# def __init__(self, **kwargs): +# super().__init__(**kwargs) + +# def parse_data_info(self, info: dict) -> Union[List[dict], dict]: +# 
data_info = super().parse_data_info(info) +# for cam in data_info['images']: +# lidar2cam = torch.Tensor(data_info['images'][cam]['lidar2cam']) +# cam2img = torch.Tensor(data_info['images'][cam]['cam2img']) +# lidar2img = get_lidar2img(cam2img, lidar2cam) +# data_info['images'][cam]['lidar2img'] = lidar2img.numpy() +# return data_info diff --git a/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py b/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py deleted file mode 100644 index 1ec4cf2..0000000 --- a/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py +++ /dev/null @@ -1,76 +0,0 @@ -import numpy as np -from mmdet.datasets import DATASETS -from mmdet3d.datasets import NuScenesDataset - - -@DATASETS.register_module() -class CustomNuScenesDataset(NuScenesDataset): - r"""NuScenes Dataset. - - This datset only add camera intrinsics and extrinsics to the results. - """ - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data \ - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): Sample index. - - pts_filename (str): Filename of point clouds. - - sweeps (list[dict]): Infos of sweeps. - - timestamp (float): Sample timestamp. - - img_filename (str, optional): Image filename. - - lidar2img (list[np.ndarray], optional): Transformations \ - from lidar to different cameras. - - ann_info (dict): Annotation info. - """ - info = self.data_infos[index] - # standard protocal modified from SECOND.Pytorch - input_dict = dict( - sample_idx=info['token'], - pts_filename=info['lidar_path'], - sweeps=info['sweeps'], - timestamp=info['timestamp'] / 1e6, - ) - - if self.modality['use_camera']: - image_paths = [] - lidar2img_rts = [] - lidar2cam_rts = [] - cam_intrinsics = [] - for cam_type, cam_info in info['cams'].items(): - image_paths.append(cam_info['data_path']) - # obtain lidar to image transformation matrix - lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) - lidar2cam_t = cam_info[ - 'sensor2lidar_translation'] @ lidar2cam_r.T - lidar2cam_rt = np.eye(4) - lidar2cam_rt[:3, :3] = lidar2cam_r.T - lidar2cam_rt[3, :3] = -lidar2cam_t - intrinsic = cam_info['cam_intrinsic'] - viewpad = np.eye(4) - viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic - lidar2img_rt = (viewpad @ lidar2cam_rt.T) - lidar2img_rts.append(lidar2img_rt) - - cam_intrinsics.append(viewpad) - lidar2cam_rts.append(lidar2cam_rt.T) - - input_dict.update( - dict( - img_filename=image_paths, - lidar2img=lidar2img_rts, - cam_intrinsic=cam_intrinsics, - lidar2cam=lidar2cam_rts, - )) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - - return input_dict diff --git a/projects/mmdet3d_plugin/datasets/pipelines/__init__.py b/projects/mmdet3d_plugin/datasets/pipelines/__init__.py deleted file mode 100755 index a246a61..0000000 --- a/projects/mmdet3d_plugin/datasets/pipelines/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .transform_3d import ( - PadMultiViewImage, NormalizeMultiviewImage, - PhotoMetricDistortionMultiViewImage, CropMultiViewImage, - RandomScaleImageMultiViewImage, - HorizontalRandomFlipMultiViewImage) - -__all__ = [ - 'PadMultiViewImage', 'NormalizeMultiviewImage', - 'PhotoMetricDistortionMultiViewImage', 'CropMultiViewImage', - 'RandomScaleImageMultiViewImage', 'HorizontalRandomFlipMultiViewImage' -] \ No newline at end of file diff --git 
a/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py b/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py deleted file mode 100755 index a0302f6..0000000 --- a/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py +++ /dev/null @@ -1,306 +0,0 @@ -import numpy as np -from numpy import random -import mmcv -from mmdet.datasets.builder import PIPELINES - - -@PIPELINES.register_module() -class PadMultiViewImage(object): - """Pad the multi-view image. - There are two padding modes: (1) pad to a fixed size and (2) pad to the - minimum size that is divisible by some number. - Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", - Args: - size (tuple, optional): Fixed padding size. - size_divisor (int, optional): The divisor of padded size. - pad_val (float, optional): Padding value, 0 by default. - """ - - def __init__(self, size=None, size_divisor=None, pad_val=0): - self.size = size - self.size_divisor = size_divisor - self.pad_val = pad_val - # only one of size and size_divisor should be valid - assert size is not None or size_divisor is not None - assert size is None or size_divisor is None - - def _pad_img(self, results): - """Pad images according to ``self.size``.""" - if self.size is not None: - padded_img = [mmcv.impad( - img, shape=self.size, pad_val=self.pad_val) for img in results['img']] - elif self.size_divisor is not None: - padded_img = [mmcv.impad_to_multiple( - img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] - results['img'] = padded_img - results['img_shape'] = [img.shape for img in padded_img] - results['pad_shape'] = [img.shape for img in padded_img] - results['pad_fixed_size'] = self.size - results['pad_size_divisor'] = self.size_divisor - - def __call__(self, results): - """Call function to pad images, masks, semantic segmentation maps. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Updated result dict. - """ - self._pad_img(results) - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(size={self.size}, ' - repr_str += f'size_divisor={self.size_divisor}, ' - repr_str += f'pad_val={self.pad_val})' - return repr_str - - -@PIPELINES.register_module() -class NormalizeMultiviewImage(object): - """Normalize the image. - Added key is "img_norm_cfg". - Args: - mean (sequence): Mean values of 3 channels. - std (sequence): Std values of 3 channels. - to_rgb (bool): Whether to convert the image from BGR to RGB, - default is true. - """ - - def __init__(self, mean, std, to_rgb=True): - self.mean = np.array(mean, dtype=np.float32) - self.std = np.array(std, dtype=np.float32) - self.to_rgb = to_rgb - - def __call__(self, results): - """Call function to normalize images. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Normalized results, 'img_norm_cfg' key is added into - result dict. - """ - results['img'] = [mmcv.imnormalize( - img, self.mean, self.std, self.to_rgb) for img in results['img']] - results['img_norm_cfg'] = dict( - mean=self.mean, std=self.std, to_rgb=self.to_rgb) - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' - return repr_str - - -@PIPELINES.register_module() -class PhotoMetricDistortionMultiViewImage: - """Apply photometric distortion to image sequentially, every transformation - is applied with a probability of 0.5. The position of random contrast is in - second or second to last. - 1. 
random brightness - 2. random contrast (mode 0) - 3. convert color from BGR to HSV - 4. random saturation - 5. random hue - 6. convert color from HSV to BGR - 7. random contrast (mode 1) - 8. randomly swap channels - Args: - brightness_delta (int): delta of brightness. - contrast_range (tuple): range of contrast. - saturation_range (tuple): range of saturation. - hue_delta (int): delta of hue. - """ - - def __init__(self, - brightness_delta=32, - contrast_range=(0.5, 1.5), - saturation_range=(0.5, 1.5), - hue_delta=18): - self.brightness_delta = brightness_delta - self.contrast_lower, self.contrast_upper = contrast_range - self.saturation_lower, self.saturation_upper = saturation_range - self.hue_delta = hue_delta - - def __call__(self, results): - """Call function to perform photometric distortion on images. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Result dict with images distorted. - """ - imgs = results['img'] - new_imgs = [] - for img in imgs: - assert img.dtype == np.float32, \ - 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ - ' please set "to_float32=True" in "LoadImageFromFile" pipeline' - # random brightness - if random.randint(2): - delta = random.uniform(-self.brightness_delta, - self.brightness_delta) - img += delta - - # mode == 0 --> do random contrast first - # mode == 1 --> do random contrast last - mode = random.randint(2) - if mode == 1: - if random.randint(2): - alpha = random.uniform(self.contrast_lower, - self.contrast_upper) - img *= alpha - - # convert color from BGR to HSV - img = mmcv.bgr2hsv(img) - - # random saturation - if random.randint(2): - img[..., 1] *= random.uniform(self.saturation_lower, - self.saturation_upper) - - # random hue - if random.randint(2): - img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) - img[..., 0][img[..., 0] > 360] -= 360 - img[..., 0][img[..., 0] < 0] += 360 - - # convert color from HSV to BGR - img = mmcv.hsv2bgr(img) - - # random contrast - if mode == 0: - if random.randint(2): - alpha = random.uniform(self.contrast_lower, - self.contrast_upper) - img *= alpha - - # randomly swap channels - if random.randint(2): - img = img[..., random.permutation(3)] - new_imgs.append(img) - results['img'] = new_imgs - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' - repr_str += 'contrast_range=' - repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' - repr_str += 'saturation_range=' - repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' - repr_str += f'hue_delta={self.hue_delta})' - return repr_str - - -@PIPELINES.register_module() -class CropMultiViewImage(object): - """Crop the image - Args: - size (tuple, optional): Fixed padding size. - """ - - def __init__(self, size=None): - self.size = size - - def __call__(self, results): - """Call function to pad images, masks, semantic segmentation maps. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Updated result dict. - """ - results['img'] = [img[:self.size[0], :self.size[1], ...] 
for img in results['img']] - results['img_shape'] = [img.shape for img in results['img']] - results['img_fixed_size'] = self.size - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(size={self.size}, ' - return repr_str - - -@PIPELINES.register_module() -class RandomScaleImageMultiViewImage(object): - """Random scale the image - Args: - scales - """ - - def __init__(self, scales=[0.5, 1.0, 1.5]): - self.scales = scales - - def __call__(self, results): - """Call function to pad images, masks, semantic segmentation maps. - Args: - results (dict): Result dict from loading pipeline. - Returns: - dict: Updated result dict. - """ - np.random.shuffle(self.scales) - rand_scale = self.scales[0] - img_shape = results['img_shape'][0] - y_size = int(img_shape[0] * rand_scale) - x_size = int(img_shape[1] * rand_scale) - scale_factor = np.eye(4) - scale_factor[0, 0] *= rand_scale - scale_factor[1, 1] *= rand_scale - results['img'] = [mmcv.imresize(img, (x_size, y_size), return_scale=False) for img in results['img']] - lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] - results['lidar2img'] = lidar2img - results['img_shape'] = [img.shape for img in results['img']] - results['gt_bboxes_3d'].tensor[:, :6] *= rand_scale - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(size={self.scales}, ' - return repr_str - - -@PIPELINES.register_module() -class HorizontalRandomFlipMultiViewImage(object): - - def __init__(self, flip_ratio=0.5): - self.flip_ratio = 0.5 - - def __call__(self, results): - if np.random.rand() >= self.flip_ratio: - return results - results = self.flip_bbox(results) - results = self.flip_cam_params(results) - results = self.flip_img(results) - return results - - def flip_img(self, results, direction='horizontal'): - results['img'] = [mmcv.imflip(img, direction) for img in results['img']] - return results - - def flip_cam_params(self, results): - flip_factor = np.eye(4) - flip_factor[1, 1] = -1 - lidar2cam = [l2c @ flip_factor for l2c in results['lidar2cam']] - w = results['img_shape'][0][1] - lidar2img = [] - for cam_intrinsic, l2c in zip(results['cam_intrinsic'], lidar2cam): - cam_intrinsic[0, 2] = w - cam_intrinsic[0, 2] - lidar2img.append(cam_intrinsic @ l2c) - results['lidar2cam'] = lidar2cam - results['lidar2img'] = lidar2img - return results - - def flip_bbox(self, input_dict, direction='horizontal'): - assert direction in ['horizontal', 'vertical'] - if len(input_dict['bbox3d_fields']) == 0: # test mode - input_dict['bbox3d_fields'].append('empty_box3d') - input_dict['empty_box3d'] = input_dict['box_type_3d']( - np.array([], dtype=np.float32)) - assert len(input_dict['bbox3d_fields']) == 1 - for key in input_dict['bbox3d_fields']: - if 'points' in input_dict: - input_dict['points'] = input_dict[key].flip( - direction, points=input_dict['points']) - else: - input_dict[key].flip(direction) - return input_dict - diff --git a/projects/mmdet3d_plugin/datasets/transforms/__init__.py b/projects/mmdet3d_plugin/datasets/transforms/__init__.py new file mode 100644 index 0000000..632b455 --- /dev/null +++ b/projects/mmdet3d_plugin/datasets/transforms/__init__.py @@ -0,0 +1 @@ +from .transform_3d import PhotoMetricDistortionMultiViewImage diff --git a/projects/mmdet3d_plugin/datasets/transforms/transform_3d.py b/projects/mmdet3d_plugin/datasets/transforms/transform_3d.py new file mode 100644 index 0000000..df46926 --- /dev/null +++ b/projects/mmdet3d_plugin/datasets/transforms/transform_3d.py @@ 
-0,0 +1,218 @@ +import mmcv +import numpy as np +from mmdet3d.registry import TRANSFORMS +from numpy import random + + +@TRANSFORMS.register_module() +class PhotoMetricDistortionMultiViewImage: + """Apply photometric distortion to images sequentially; every transformation + is applied with a probability of 0.5. + + Random contrast is applied either + second or second to last. + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def __call__(self, results): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Result dict with images distorted. + """ + imgs = results['img'] + new_imgs = [] + for img in imgs: + assert img.dtype == np.float32, \ + 'PhotoMetricDistortion needs input images of dtype np.float32,'\ + ' please set "to_float32=True" in "LoadImageFromFile" pipeline' + # random brightness + if random.randint(2): + delta = random.uniform(-self.brightness_delta, + self.brightness_delta) + img += delta + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # convert color from BGR to HSV + img = mmcv.bgr2hsv(img) + + # random saturation + if random.randint(2): + img[..., 1] *= random.uniform(self.saturation_lower, + self.saturation_upper) + + # random hue + if random.randint(2): + img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # randomly swap channels + if random.randint(2): + img = img[..., random.permutation(3)] + new_imgs.append(img) + results['img'] = new_imgs + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + +# @TRANSFORMS.register_module() +# class CropMultiViewImage(object): +# """Crop the image +# Args: +# size (tuple, optional): Fixed padding size. +# """ + +# def __init__(self, size=None): +# self.size = size + +# def __call__(self, results): +# """Call function to crop images. +# Args: +# results (dict): Result dict from loading pipeline. +# Returns: +# dict: Updated result dict.
+# """ +# results['img'] = [img[:self.size[0], :self.size[1], ...] for img in results['img']] +# results['img_shape'] = [img.shape for img in results['img']] +# results['img_fixed_size'] = self.size +# return results + +# def __repr__(self): +# repr_str = self.__class__.__name__ +# repr_str += f'(size={self.size}, ' +# return repr_str + +# @TRANSFORMS.register_module() +# class RandomScaleImageMultiViewImage(object): +# """Random scale the image +# Args: +# scales +# """ + +# def __init__(self, scales=[0.5, 1.0, 1.5]): +# self.scales = scales + +# def __call__(self, results): +# """Call function to pad images, masks, semantic segmentation maps. +# Args: +# results (dict): Result dict from loading pipeline. +# Returns: +# dict: Updated result dict. +# """ +# np.random.shuffle(self.scales) +# rand_scale = self.scales[0] +# img_shape = results['img_shape'][0] +# y_size = int(img_shape[0] * rand_scale) +# x_size = int(img_shape[1] * rand_scale) +# scale_factor = np.eye(4) +# scale_factor[0, 0] *= rand_scale +# scale_factor[1, 1] *= rand_scale +# results['img'] = [mmcv.imresize(img, (x_size, y_size), return_scale=False) for img in results['img']] +# lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] +# results['lidar2img'] = lidar2img +# results['img_shape'] = [img.shape for img in results['img']] +# results['gt_bboxes_3d'].tensor[:, :6] *= rand_scale +# return results + +# def __repr__(self): +# repr_str = self.__class__.__name__ +# repr_str += f'(size={self.scales}, ' +# return repr_str + +# @TRANSFORMS.register_module() +# class HorizontalRandomFlipMultiViewImage(object): + +# def __init__(self, flip_ratio=0.5): +# self.flip_ratio = 0.5 + +# def __call__(self, results): +# if np.random.rand() >= self.flip_ratio: +# return results +# results = self.flip_bbox(results) +# results = self.flip_cam_params(results) +# results = self.flip_img(results) +# return results + +# def flip_img(self, results, direction='horizontal'): +# results['img'] = [mmcv.imflip(img, direction) for img in results['img']] +# return results + +# def flip_cam_params(self, results): +# flip_factor = np.eye(4) +# flip_factor[1, 1] = -1 +# lidar2cam = [l2c @ flip_factor for l2c in results['lidar2cam']] +# w = results['img_shape'][0][1] +# lidar2img = [] +# for cam_intrinsic, l2c in zip(results['cam_intrinsic'], lidar2cam): +# cam_intrinsic[0, 2] = w - cam_intrinsic[0, 2] +# lidar2img.append(cam_intrinsic @ l2c) +# results['lidar2cam'] = lidar2cam +# results['lidar2img'] = lidar2img +# return results + +# def flip_bbox(self, input_dict, direction='horizontal'): +# assert direction in ['horizontal', 'vertical'] +# if len(input_dict['bbox3d_fields']) == 0: # test mode +# input_dict['bbox3d_fields'].append('empty_box3d') +# input_dict['empty_box3d'] = input_dict['box_type_3d']( +# np.array([], dtype=np.float32)) +# assert len(input_dict['bbox3d_fields']) == 1 +# for key in input_dict['bbox3d_fields']: +# if 'points' in input_dict: +# input_dict['points'] = input_dict[key].flip( +# direction, points=input_dict['points']) +# else: +# input_dict[key].flip(direction) +# return input_dict diff --git a/projects/mmdet3d_plugin/models/backbones/__init__.py b/projects/mmdet3d_plugin/models/backbones/__init__.py index cea72f5..418d62a 100755 --- a/projects/mmdet3d_plugin/models/backbones/__init__.py +++ b/projects/mmdet3d_plugin/models/backbones/__init__.py @@ -1,3 +1 @@ from .vovnet import VoVNet - -__all__ = ['VoVNet'] \ No newline at end of file diff --git a/projects/mmdet3d_plugin/models/backbones/vovnet.py 
b/projects/mmdet3d_plugin/models/backbones/vovnet.py index 879d186..3490597 100755 --- a/projects/mmdet3d_plugin/models/backbones/vovnet.py +++ b/projects/mmdet3d_plugin/models/backbones/vovnet.py @@ -1,31 +1,30 @@ - from collections import OrderedDict -from mmcv.runner import BaseModule -from mmdet.models.builder import BACKBONES + import torch import torch.nn as nn import torch.nn.functional as F +from mmdet3d.registry import MODELS +from mmengine.model import BaseModule from torch.nn.modules.batchnorm import _BatchNorm - VoVNet19_slim_dw_eSE = { 'stem': [64, 64, 64], 'stage_conv_ch': [64, 80, 96, 112], 'stage_out_ch': [112, 256, 384, 512], - "layer_per_block": 3, - "block_per_stage": [1, 1, 1, 1], - "eSE": True, - "dw": True + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + 'dw': True } VoVNet19_dw_eSE = { 'stem': [64, 64, 64], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 3, - "block_per_stage": [1, 1, 1, 1], - "eSE": True, - "dw": True + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + 'dw': True } VoVNet19_slim_eSE = { @@ -35,89 +34,103 @@ 'layer_per_block': 3, 'block_per_stage': [1, 1, 1, 1], 'eSE': True, - "dw": False + 'dw': False } VoVNet19_eSE = { 'stem': [64, 64, 128], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 3, - "block_per_stage": [1, 1, 1, 1], - "eSE": True, - "dw": False + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + 'dw': False } VoVNet39_eSE = { 'stem': [64, 64, 128], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 5, - "block_per_stage": [1, 1, 2, 2], - "eSE": True, - "dw": False + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 5, + 'block_per_stage': [1, 1, 2, 2], + 'eSE': True, + 'dw': False } VoVNet57_eSE = { 'stem': [64, 64, 128], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 5, - "block_per_stage": [1, 1, 4, 3], - "eSE": True, - "dw": False + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 5, + 'block_per_stage': [1, 1, 4, 3], + 'eSE': True, + 'dw': False } VoVNet99_eSE = { 'stem': [64, 64, 128], - "stage_conv_ch": [128, 160, 192, 224], - "stage_out_ch": [256, 512, 768, 1024], - "layer_per_block": 5, - "block_per_stage": [1, 3, 9, 3], - "eSE": True, - "dw": False + 'stage_conv_ch': [128, 160, 192, 224], + 'stage_out_ch': [256, 512, 768, 1024], + 'layer_per_block': 5, + 'block_per_stage': [1, 3, 9, 3], + 'eSE': True, + 'dw': False } _STAGE_SPECS = { - "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE, - "V-19-dw-eSE": VoVNet19_dw_eSE, - "V-19-slim-eSE": VoVNet19_slim_eSE, - "V-19-eSE": VoVNet19_eSE, - "V-39-eSE": VoVNet39_eSE, - "V-57-eSE": VoVNet57_eSE, - "V-99-eSE": VoVNet99_eSE, + 'V-19-slim-dw-eSE': VoVNet19_slim_dw_eSE, + 'V-19-dw-eSE': VoVNet19_dw_eSE, + 'V-19-slim-eSE': VoVNet19_slim_eSE, + 'V-19-eSE': VoVNet19_eSE, + 'V-39-eSE': VoVNet39_eSE, + 'V-57-eSE': VoVNet57_eSE, + 'V-99-eSE': VoVNet99_eSE, } -def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): - """3x3 convolution with padding""" +def dw_conv3x3(in_channels, + out_channels, + module_name, + 
postfix, + stride=1, + kernel_size=3, + padding=1): + """3x3 convolution with padding.""" return [ - ( - '{}_{}/dw_conv3x3'.format(module_name, postfix), - nn.Conv2d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=out_channels, - bias=False - ) - ), - ( - '{}_{}/pw_conv1x1'.format(module_name, postfix), - nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) - ), - ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), + ('{}_{}/dw_conv3x3'.format(module_name, postfix), + nn.Conv2d(in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=out_channels, + bias=False)), + ('{}_{}/pw_conv1x1'.format(module_name, postfix), + nn.Conv2d(in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False)), + ('{}_{}/pw_norm'.format(module_name, + postfix), nn.BatchNorm2d(out_channels)), ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), ] -def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): - """3x3 convolution with padding""" +def conv3x3(in_channels, + out_channels, + module_name, + postfix, + stride=1, + groups=1, + kernel_size=3, + padding=1): + """3x3 convolution with padding.""" return [ ( - f"{module_name}_{postfix}/conv", + f'{module_name}_{postfix}/conv', nn.Conv2d( in_channels, out_channels, @@ -128,16 +141,23 @@ def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, bias=False, ), ), - (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), - (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)), + (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)), ] -def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): - """1x1 convolution with padding""" +def conv1x1(in_channels, + out_channels, + module_name, + postfix, + stride=1, + groups=1, + kernel_size=1, + padding=0): + """1x1 convolution with padding.""" return [ ( - f"{module_name}_{postfix}/conv", + f'{module_name}_{postfix}/conv', nn.Conv2d( in_channels, out_channels, @@ -148,12 +168,13 @@ def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, bias=False, ), ), - (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), - (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)), + (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)), ] class Hsigmoid(nn.Module): + def __init__(self, inplace=True): super(Hsigmoid, self).__init__() self.inplace = inplace @@ -163,6 +184,7 @@ def forward(self, x): class eSEModule(nn.Module): + def __init__(self, channel, reduction=4): super(eSEModule, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) @@ -178,9 +200,16 @@ def forward(self, x): class _OSA_module(nn.Module): - def __init__( - self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False - ): + + def __init__(self, + in_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE=False, + identity=False, + depthwise=False): super(_OSA_module, self).__init__() @@ -192,18 +221,26 @@ def __init__( if self.depthwise and in_channel != stage_ch: self.isReduced = True self.conv_reduction = nn.Sequential( - OrderedDict(conv1x1(in_channel, stage_ch, 
"{}_reduction".format(module_name), "0")) - ) + OrderedDict( + conv1x1(in_channel, stage_ch, + '{}_reduction'.format(module_name), '0'))) for i in range(layer_per_block): if self.depthwise: - self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i)))) + self.layers.append( + nn.Sequential( + OrderedDict( + dw_conv3x3(stage_ch, stage_ch, module_name, i)))) else: - self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) + self.layers.append( + nn.Sequential( + OrderedDict( + conv3x3(in_channel, stage_ch, module_name, i)))) in_channel = stage_ch # feature aggregation in_channel = in_ch + layer_per_block * stage_ch - self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) + self.concat = nn.Sequential( + OrderedDict(conv1x1(in_channel, concat_ch, module_name, 'concat'))) self.ese = eSEModule(concat_ch) @@ -231,44 +268,65 @@ def forward(self, x): class _OSA_stage(nn.Sequential): - def __init__( - self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False - ): + + def __init__(self, + in_ch, + stage_ch, + concat_ch, + block_per_stage, + layer_per_block, + stage_num, + SE=False, + depthwise=False): super(_OSA_stage, self).__init__() if not stage_num == 2: - self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) + self.add_module( + 'Pooling', nn.MaxPool2d(kernel_size=3, + stride=2, + ceil_mode=True)) if block_per_stage != 1: SE = False - module_name = f"OSA{stage_num}_1" + module_name = f'OSA{stage_num}_1' self.add_module( - module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) - ) + module_name, + _OSA_module(in_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE, + depthwise=depthwise)) for i in range(block_per_stage - 1): if i != block_per_stage - 2: # last block SE = False - module_name = f"OSA{stage_num}_{i + 2}" + module_name = f'OSA{stage_num}_{i + 2}' self.add_module( module_name, - _OSA_module( - concat_ch, - stage_ch, - concat_ch, - layer_per_block, - module_name, - SE, - identity=True, - depthwise=depthwise - ), + _OSA_module(concat_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE, + identity=True, + depthwise=depthwise), ) -@BACKBONES.register_module() +@MODELS.register_module() class VoVNet(BaseModule): - def __init__(self, spec_name, input_ch=3, out_features=None, - frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): + + def __init__(self, + spec_name, + input_ch=3, + out_features=None, + frozen_stages=-1, + norm_eval=True, + pretrained=None, + init_cfg=None): """ Args: input_ch(int) : the number of input channel @@ -285,32 +343,35 @@ def __init__(self, spec_name, input_ch=3, out_features=None, self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) stage_specs = _STAGE_SPECS[spec_name] - stem_ch = stage_specs["stem"] - config_stage_ch = stage_specs["stage_conv_ch"] - config_concat_ch = stage_specs["stage_out_ch"] - block_per_stage = stage_specs["block_per_stage"] - layer_per_block = stage_specs["layer_per_block"] - SE = stage_specs["eSE"] - depthwise = stage_specs["dw"] + stem_ch = stage_specs['stem'] + config_stage_ch = stage_specs['stage_conv_ch'] + config_concat_ch = stage_specs['stage_out_ch'] + block_per_stage = stage_specs['block_per_stage'] + layer_per_block = stage_specs['layer_per_block'] + SE = stage_specs['eSE'] + depthwise = stage_specs['dw'] self._out_features = out_features # 
Stem module conv_type = dw_conv3x3 if depthwise else conv3x3 - stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) - stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) - stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) - self.add_module("stem", nn.Sequential((OrderedDict(stem)))) + stem = conv3x3(input_ch, stem_ch[0], 'stem', '1', 2) + stem += conv_type(stem_ch[0], stem_ch[1], 'stem', '2', 1) + stem += conv_type(stem_ch[1], stem_ch[2], 'stem', '3', 2) + self.add_module('stem', nn.Sequential((OrderedDict(stem)))) current_stirde = 4 - self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} - self._out_feature_channels = {"stem": stem_ch[2]} + self._out_feature_strides = { + 'stem': current_stirde, + 'stage2': current_stirde + } + self._out_feature_channels = {'stem': stem_ch[2]} stem_out_ch = [stem_ch[2]] in_ch_list = stem_out_ch + config_concat_ch[:-1] # OSA stages self.stage_names = [] for i in range(4): # num_stages - name = "stage%d" % (i + 2) # stage 2 ... stage 5 + name = 'stage%d' % (i + 2) # stage 2 ... stage 5 self.stage_names.append(name) self.add_module( name, @@ -328,7 +389,8 @@ def __init__(self, spec_name, input_ch=3, out_features=None, self._out_feature_channels[name] = config_concat_ch[i] if not i == 0: - self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) + self._out_feature_strides[name] = current_stirde = int( + current_stirde * 2) # initialize weights # self._initialize_weights() @@ -341,8 +403,8 @@ def _initialize_weights(self): def forward(self, x): outputs = {} x = self.stem(x) - if "stem" in self._out_features: - outputs["stem"] = x + if 'stem' in self._out_features: + outputs['stem'] = x for name in self.stage_names: x = getattr(self, name)(x) if name in self._out_features: @@ -372,4 +434,4 @@ def train(self, mode=True): for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): - m.eval() \ No newline at end of file + m.eval() diff --git a/projects/mmdet3d_plugin/models/dense_heads/__init__.py b/projects/mmdet3d_plugin/models/dense_heads/__init__.py index 9284477..ee90407 100755 --- a/projects/mmdet3d_plugin/models/dense_heads/__init__.py +++ b/projects/mmdet3d_plugin/models/dense_heads/__init__.py @@ -1,4 +1 @@ -from .dgcnn3d_head import DGCNN3DHead from .detr3d_head import Detr3DHead - -__all__ = ['DGCNN3DHead', 'Detr3DHead'] \ No newline at end of file diff --git a/projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py b/projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py old mode 100755 new mode 100644 index 39803a1..a5eca03 --- a/projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py +++ b/projects/mmdet3d_plugin/models/dense_heads/detr3d_head.py @@ -1,22 +1,33 @@ import copy +from typing import Dict, List, Optional, Sequence, Tuple import torch import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn import Linear, bias_init_with_prob -from mmcv.runner import force_fp32 - -from mmdet.core import (multi_apply, multi_apply, reduce_mean) -from mmdet.models.utils.transformer import inverse_sigmoid -from mmdet.models import HEADS +from mmcv.cnn import Linear +from mmdet3d.models.task_modules.builder import \ + build_bbox_coder # need to change +from mmdet3d.registry import MODELS from mmdet.models.dense_heads import DETRHead -from mmdet3d.core.bbox.coders import build_bbox_coder -from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from mmdet.models.layers import inverse_sigmoid +from mmdet.models.utils import multi_apply 
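+# dev-1.x builds assigner/sampler/coder components through mmdet's TASK_UTILS registry (see the PseudoSampler construction in __init__ below). +# The 10-dim box code this head regresses is documented in the class docstring as (cx, cy, l, w, cz, h, sin(φ), cos(φ), v_x, v_y); normalize_bbox (imported below) converts gt boxes into that code when computing losses.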
+from mmdet.registry import TASK_UTILS +from mmdet.structures import SampleList +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.model import bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor +from projects.mmdet3d_plugin.models.task_modules.util import normalize_bbox -@HEADS.register_module() +# from projects.mmdet3d_plugin.models.utils.old_env import force_fp32 + + +@MODELS.register_module() class Detr3DHead(DETRHead): - """Head of Detr3D. + """Head of Detr3D. + Args: with_box_refine (bool): Whether to refine the reference points in the decoder. Defaults to False. @@ -24,37 +35,45 @@ class Detr3DHead(DETRHead): the outputs of encoder. transformer (obj:`ConfigDict`): ConfigDict is used for building the Encoder and Decoder. + bbox_coder (obj:`ConfigDict`): Config to build the bbox coder. + num_cls_fcs (int) : the number of layers in the cls and reg branches + code_weights (List[float]) : loss weights of (cx, cy, l, w, cz, h, sin(φ), cos(φ), v_x, v_y) + code_size (int) : length of the box code (matches code_weights) """ - def __init__(self, - *args, - with_box_refine=False, - as_two_stage=False, - transformer=None, - bbox_coder=None, - num_cls_fcs=2, - code_weights=None, - **kwargs): + + def __init__( + self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + code_size=10, + **kwargs): self.with_box_refine = with_box_refine self.as_two_stage = as_two_stage if self.as_two_stage: transformer['as_two_stage'] = self.as_two_stage - if 'code_size' in kwargs: - self.code_size = kwargs['code_size'] - else: - self.code_size = 10 - if code_weights is not None: - self.code_weights = code_weights - else: - self.code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] - + self.code_size = code_size + self.code_weights = code_weights + self.bbox_coder = build_bbox_coder(bbox_coder) self.pc_range = self.bbox_coder.pc_range self.num_cls_fcs = num_cls_fcs - 1 - super(Detr3DHead, self).__init__( - *args, transformer=transformer, **kwargs) - self.code_weights = nn.Parameter(torch.tensor( - self.code_weights, requires_grad=False), requires_grad=False) - + super(Detr3DHead, self).__init__(*args, + transformer=transformer, + **kwargs) + # DETR-style heads do not sample (sampling=False), so use PseudoSampler to format the assignment results + sampler_cfg = dict(type='PseudoSampler') + self.sampler = TASK_UTILS.build(sampler_cfg) + + self.code_weights = nn.Parameter(torch.tensor(self.code_weights, + requires_grad=False), + requires_grad=False) + + # Note: the 0.x forward_train() entry point becomes loss() in dev-1.x def _init_layers(self): """Initialize classification branch and regression branch of head.""" cls_branch = [] @@ -101,10 +120,12 @@ def init_weights(self): for m in self.cls_branches: nn.init.constant_(m[-1].bias, bias_init) - def forward(self, mlvl_feats, img_metas): + def forward(self, mlvl_feats: List[Tensor], img_metas: List[Dict], + **kwargs) -> Dict[str, Tensor]: """Forward function. + Args: - mlvl_feats (tuple[Tensor]): Features from the upstream + mlvl_feats (List[Tensor]): Features from the upstream network, each is a 5D-tensor with shape (B, N, C, H, W). Returns: @@ -112,18 +133,16 @@ def forward(self, mlvl_feats, img_metas): shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background.
-    def _get_target_single(self,
-                           cls_score,
-                           bbox_pred,
-                           gt_labels,
-                           gt_bboxes,
-                           gt_bboxes_ignore=None):
-        """"Compute regression and classification targets for one image.
-        Outputs from a single decoder layer of a single feature level are used.
-        Args:
-            cls_score (Tensor): Box score logits from a single decoder layer
-                for one image. Shape [num_query, cls_out_channels].
-            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
-                for one image, with normalized coordinate (cx, cy, w, h) and
-                shape [num_query, 4].
-            gt_bboxes (Tensor): Ground truth bboxes for one image with
-                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
-            gt_labels (Tensor): Ground truth class indices for one image
-                with shape (num_gts, ).
-            gt_bboxes_ignore (Tensor, optional): Bounding boxes
-                which can be ignored. Default None.
-        Returns:
-            tuple[Tensor]: a tuple containing the following for one image.
-                - labels (Tensor): Labels of each image.
-                - label_weights (Tensor]): Label weights of each image.
-                - bbox_targets (Tensor): BBox targets of each image.
-                - bbox_weights (Tensor): BBox weights of each image.
-                - pos_inds (Tensor): Sampled positive indices for each image.
-                - neg_inds (Tensor): Sampled negative indices for each image.
-        """
-
-        num_bboxes = bbox_pred.size(0)
-        # assigner and sampler
-        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
-                                             gt_labels, gt_bboxes_ignore)
-        sampling_result = self.sampler.sample(assign_result, bbox_pred,
-                                              gt_bboxes)
+    def _get_target_single(
+            self,
+            cls_score: Tensor,  # [query, num_cls]
+            bbox_pred: Tensor,  # [query, 10]
+            gt_instances_3d: InstanceData) -> Tuple[Tensor, ...]:
+        """Compute regression and classification targets for a single
+        image."""
+        # turn bottom center into gravity center
+        gt_bboxes = gt_instances_3d.bboxes_3d  # [num_gt, 9]
+        gt_bboxes = torch.cat(
+            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1)
+
+        gt_labels = gt_instances_3d.labels_3d  # [num_gt, num_cls]
+        # assigner and sampler: PseudoSampler
+        assign_result = self.assigner.assign(bbox_pred,
+                                             cls_score,
+                                             gt_bboxes,
+                                             gt_labels,
+                                             gt_bboxes_ignore=None)
+        sampling_result = self.sampler.sample(
+            assign_result, InstanceData(priors=bbox_pred),
+            InstanceData(bboxes_3d=gt_bboxes))
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        # label targets
+        num_bboxes = bbox_pred.size(0)
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
@@ -209,36 +220,38 @@
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
-        bbox_targets = torch.zeros_like(bbox_pred)[..., :9]
+        # theta in the gt box here is still a single scalar
+        bbox_targets = torch.zeros_like(bbox_pred)[..., :self.code_size - 1]
        bbox_weights = torch.zeros_like(bbox_pred)
+        # only matched queries learn from box coordinates
        bbox_weights[pos_inds] = 1.0

-        # DETR
+        # fix empty-gt bug in multi-GPU training
+        if sampling_result.pos_gt_bboxes.shape[0] == 0:
+            sampling_result.pos_gt_bboxes = \
+                sampling_result.pos_gt_bboxes.reshape(0, self.code_size - 1)
+
        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
-        return (labels, label_weights, bbox_targets, bbox_weights,
-                pos_inds, neg_inds)
-
-    def get_targets(self,
-                    cls_scores_list,
-                    bbox_preds_list,
-                    gt_bboxes_list,
-                    gt_labels_list,
-                    gt_bboxes_ignore_list=None):
-        """"Compute regression and classification targets for a batch image.
-        Outputs from a single decoder layer of a single feature level are used.
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
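The target preparation above swaps the box origin: `bboxes_3d.tensor` stores the bottom center, while matching uses the gravity center. A minimal sketch of that conversion, assuming a LiDAR box layout of (x, y, z_bottom, l, w, h, yaw):

    import torch

    # one toy box: bottom center (1, 2, 0), l=4, w=2, h=1.5, yaw=0
    box = torch.tensor([[1.0, 2.0, 0.0, 4.0, 2.0, 1.5, 0.0]])
    gravity_center = box[:, :3].clone()
    gravity_center[:, 2] += box[:, 5] * 0.5  # lift z by half the height
    gt = torch.cat((gravity_center, box[:, 3:]), dim=1)
    # gt -> tensor([[1.00, 2.00, 0.75, 4.00, 2.00, 1.50, 0.00]])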
+    def get_targets(
+            self,
+            batch_cls_scores: List[Tensor],  # bs[num_q,num_cls]
+            batch_bbox_preds: List[Tensor],  # bs[num_q,10]
+            batch_gt_instances_3d: InstanceList) -> tuple:
+        """Compute regression and classification targets for a batch of
+        images, for a single decoder layer.
+
        Args:
-            cls_scores_list (list[Tensor]): Box score logits from a single
+            batch_cls_scores (list[Tensor]): Box score logits from a single
                decoder layer for each image with shape [num_query,
                cls_out_channels].
-            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+            batch_bbox_preds (list[Tensor]): Sigmoid outputs from a single
                decoder layer for each image, with normalized coordinate
-                (cx, cy, w, h) and shape [num_query, 4].
-            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
-                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
-            gt_labels_list (list[Tensor]): Ground truth class indices for each
-                image with shape (num_gts, ).
-            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
-                boxes which can be ignored for each image. Default None.
+                (cx, cy, l, w, cz, h, sin(φ), cos(φ), v_x, v_y) and shape
+                [num_query, 10].
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d`` and
+                ``labels_3d``.
        Returns:
            tuple: a tuple containing the following targets.
            - labels_list (list[Tensor]): Labels for all images.
            - label_weights_list (list[Tensor]): Label weights for all \
                images.
            - bbox_targets_list (list[Tensor]): BBox targets for all \
                images.
            - bbox_weights_list (list[Tensor]): BBox weights for all \
                images.
            - num_total_pos (int): Number of positive samples in all \
                images.
            - num_total_neg (int): Number of negative samples in all \
                images.
        """
-        assert gt_bboxes_ignore_list is None, \
-            'Only supports for gt_bboxes_ignore setting to None.'
-        num_imgs = len(cls_scores_list)
-        gt_bboxes_ignore_list = [
-            gt_bboxes_ignore_list for _ in range(num_imgs)
-        ]
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         pos_inds_list, neg_inds_list) = multi_apply(self._get_target_single,
+                                                     batch_cls_scores,
+                                                     batch_bbox_preds,
+                                                     batch_gt_instances_3d)

-        (labels_list, label_weights_list, bbox_targets_list,
-         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
-             self._get_target_single, cls_scores_list, bbox_preds_list,
-             gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)
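`multi_apply` above is the mmdet helper that maps a per-image function over the batch lists and regroups the per-image tuples into one tuple of lists. A self-contained equivalent sketch (not the library code itself):

    from functools import partial

    def multi_apply_sketch(func, *args, **kwargs):
        # apply func to each per-image slice of the argument lists ...
        pfunc = partial(func, **kwargs) if kwargs else func
        map_results = map(pfunc, *args)
        # ... then transpose: one list per return value of func
        return tuple(map(list, zip(*map_results)))

    # e.g. if func returns (labels, weights) per image, the call returns
    # ([labels_img0, labels_img1, ...], [weights_img0, weights_img1, ...])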
-    def loss_single(self,
-                    cls_scores,
-                    bbox_preds,
-                    gt_bboxes_list,
-                    gt_labels_list,
-                    gt_bboxes_ignore_list=None):
+    def loss_by_feat_single(
+            self,
+            batch_cls_scores: Tensor,  # bs,num_q,num_cls
+            batch_bbox_preds: Tensor,  # bs,num_q,10
+            batch_gt_instances_3d: InstanceList) -> Tuple[Tensor, Tensor]:
        """Loss function for outputs from a single decoder layer of a single
        feature level.
+
        Args:
-            cls_scores (Tensor): Box score logits from a single decoder layer
-                for all images. Shape [bs, num_query, cls_out_channels].
-            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
-                for all images, with normalized coordinate (cx, cy, w, h) and
-                shape [bs, num_query, 4].
-            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
-                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
-            gt_labels_list (list[Tensor]): Ground truth class indices for each
-                image with shape (num_gts, ).
-            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
-                boxes which can be ignored for each image. Default None.
+            batch_cls_scores (Tensor): Box score logits from a single
+                decoder layer for batched images with shape [bs, num_query,
+                cls_out_channels].
+            batch_bbox_preds (Tensor): Sigmoid outputs from a single
+                decoder layer for batched images, with normalized coordinate
+                (cx, cy, l, w, cz, h, sin(φ), cos(φ), v_x, v_y) and shape
+                [bs, num_query, 10].
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d`` and
+                ``labels_3d``.
        Returns:
-            dict[str, Tensor]: A dictionary of loss components for outputs from
+            tuple(Tensor, Tensor): cls and reg loss for outputs from
                a single decoder layer.
""" - num_imgs = cls_scores.size(0) - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + batch_size = batch_cls_scores.size(0) # batch size + cls_scores_list = [batch_cls_scores[i] for i in range(batch_size)] + bbox_preds_list = [batch_bbox_preds[i] for i in range(batch_size)] cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, - gt_bboxes_list, gt_labels_list, - gt_bboxes_ignore_list) + batch_gt_instances_3d) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets labels = torch.cat(labels_list, 0) @@ -307,105 +312,87 @@ def loss_single(self, bbox_weights = torch.cat(bbox_weights_list, 0) # classification loss - cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + batch_cls_scores = batch_cls_scores.reshape(-1, self.cls_out_channels) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( - cls_scores.new_tensor([cls_avg_factor])) + batch_cls_scores.new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + loss_cls = self.loss_cls(batch_cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) - # Compute the average number of gt boxes accross all gpus, for + # Compute the average number of gt boxes across all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() # regression L1 loss - bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + batch_bbox_preds = batch_bbox_preds.reshape(-1, + batch_bbox_preds.size(-1)) normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + # neg_query is all 0, log(0) is NaN isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) bbox_weights = bbox_weights * self.code_weights loss_bbox = self.loss_bbox( - bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=num_total_pos) + batch_bbox_preds[isnotnan, :self.code_size], + normalized_bbox_targets[isnotnan, :self.code_size], + bbox_weights[isnotnan, :self.code_size], + avg_factor=num_total_pos) loss_cls = torch.nan_to_num(loss_cls) loss_bbox = torch.nan_to_num(loss_bbox) return loss_cls, loss_bbox - - @force_fp32(apply_to=('preds_dicts')) - def loss(self, - gt_bboxes_list, - gt_labels_list, - preds_dicts, - gt_bboxes_ignore=None): - """"Loss function. + + # original loss() + # @force_fp32(apply_to=('preds_dicts')) + def loss_by_feat( + self, + batch_gt_instances_3d: InstanceList, + preds_dicts: Dict[str, Tensor], + batch_gt_instances_3d_ignore: OptInstanceList = None) -> Dict: + """Compute loss of the head. + Args: - - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - preds_dicts: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_bbox_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (cx, cy, w, h) and shape - [nb_dec, bs, num_query, 4]. 
-                enc_cls_scores (Tensor): Classification scores of
-                    points on encode feature map , has shape
-                    (N, h*w, num_classes). Only be passed when as_two_stage is
-                    True, otherwise is None.
-                enc_bbox_preds (Tensor): Regression results of each points
-                    on the encode feature map, has shape (N, h*w, 4). Only be
-                    passed when as_two_stage is True, otherwise is None.
-            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
-                which can be ignored for each image. Default None.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d``,
+                ``labels_3d``, ``depths``, ``centers_2d`` and attributes.
+            preds_dicts (dict[str, Tensor]): Outputs of self.forward().
+            batch_gt_instances_3d_ignore (list[:obj:`InstanceData`], Optional):
+                NOT supported. Defaults to None.
+
        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
-        assert gt_bboxes_ignore is None, \
+        assert batch_gt_instances_3d_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
-            f'for gt_bboxes_ignore setting to None.'
-
-        all_cls_scores = preds_dicts['all_cls_scores']
-        all_bbox_preds = preds_dicts['all_bbox_preds']
+            f'for batch_gt_instances_3d_ignore setting to None.'
+        all_cls_scores = preds_dicts[
+            'all_cls_scores']  # num_dec,bs,num_q,num_cls
+        all_bbox_preds = preds_dicts['all_bbox_preds']  # num_dec,bs,num_q,10
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']

+        # calculate loss for each decoder layer
        num_dec_layers = len(all_cls_scores)
-        device = gt_labels_list[0].device
-        gt_bboxes_list = [torch.cat(
-            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
-            dim=1).to(device) for gt_bboxes in gt_bboxes_list]
-
-        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
-        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
-        all_gt_bboxes_ignore_list = [
-            gt_bboxes_ignore for _ in range(num_dec_layers)
+        batch_gt_instances_3d_list = [
+            batch_gt_instances_3d for _ in range(num_dec_layers)
        ]
-
-        losses_cls, losses_bbox = multi_apply(
-            self.loss_single, all_cls_scores, all_bbox_preds,
-            all_gt_bboxes_list, all_gt_labels_list,
-            all_gt_bboxes_ignore_list)
+        losses_cls, losses_bbox = multi_apply(self.loss_by_feat_single,
+                                              all_cls_scores, all_bbox_preds,
+                                              batch_gt_instances_3d_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
-            binary_labels_list = [
-                torch.zeros_like(gt_labels_list[i])
-                for i in range(len(all_gt_labels_list))
-            ]
            enc_loss_cls, enc_losses_bbox = \
-                self.loss_single(enc_cls_scores, enc_bbox_preds,
-                                 gt_bboxes_list, binary_labels_list, gt_bboxes_ignore)
+                self.loss_by_feat_single(enc_cls_scores, enc_bbox_preds,
+                                         batch_gt_instances_3d)
+            # note: pass the per-image gt list here, not the per-layer
+            # replication built above
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox

@@ -415,31 +402,56 @@
        # loss from other decoder layers
        num_dec_layer = 0
-        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1],
-                                           losses_bbox[:-1]):
+        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            num_dec_layer += 1
        return loss_dict
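With the default 6-layer decoder, the resulting dict carries the last layer's losses under plain keys and earlier layers under `d{i}.` prefixes; a toy illustration with dummy scalars:

    losses_cls = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4]    # dummy per-layer values
    losses_bbox = [1.9, 1.8, 1.7, 1.6, 1.5, 1.4]
    loss_dict = dict(loss_cls=losses_cls[-1], loss_bbox=losses_bbox[-1])
    for i, (lc, lb) in enumerate(zip(losses_cls[:-1], losses_bbox[:-1])):
        loss_dict[f'd{i}.loss_cls'] = lc
        loss_dict[f'd{i}.loss_bbox'] = lb
    # keys: loss_cls, loss_bbox, d0.loss_cls, ..., d4.loss_bbox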
-    @force_fp32(apply_to=('preds_dicts'))
-    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
-        """Generate bboxes from bbox head predictions.
+    # @force_fp32(apply_to=('preds_dicts'))
+    def predict_by_feat(self,
+                        preds_dicts,
+                        img_metas,
+                        rescale=False) -> InstanceList:
+        """Transform network output for a batch into bbox predictions.
+
        Args:
-            preds_dicts (tuple[list[dict]]): Prediction results.
-            img_metas (list[dict]): Point cloud and image's meta info.
+            preds_dicts (Dict[str, Tensor]):
+                - all_cls_scores (Tensor): Outputs from the classification
+                  head, shape [nb_dec, bs, num_query, cls_out_channels].
+                  Note cls_out_channels should include background.
+                - all_bbox_preds (Tensor): Sigmoid outputs from the
+                  regression head with normalized coordinate format
+                  (cx, cy, l, w, cz, h, rot_sine, rot_cosine, v_x, v_y).
+                  Shape [nb_dec, bs, num_query, 10].
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
        Returns:
-            list[dict]: Decoded bbox, scores and labels after nms.
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C), where C >= 7.
        """
+        # decode sin(θ) & cos(θ) back to the yaw angle θ
        preds_dicts = self.bbox_coder.decode(preds_dicts)
        num_samples = len(preds_dicts)  # batch size
        ret_list = []
        for i in range(num_samples):
+            results = InstanceData()
            preds = preds_dicts[i]
            bboxes = preds['bboxes']
+            # move the box center from gravity center back to bottom center
            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
-            bboxes = img_metas[i]['box_type_3d'](bboxes, 9)
-            scores = preds['scores']
-            labels = preds['labels']
-            ret_list.append([bboxes, scores, labels])
+            bboxes = img_metas[i]['box_type_3d'](bboxes, self.code_size - 1)
+
+            results.bboxes_3d = bboxes
+            results.scores_3d = preds['scores']
+            results.labels_3d = preds['labels']
+            ret_list.append(results)
        return ret_list
diff --git a/projects/mmdet3d_plugin/models/dense_heads/dgcnn3d_head.py b/projects/mmdet3d_plugin/models/dense_heads/dgcnn3d_head.py
deleted file mode 100755
index 1f6e7e9..0000000
--- a/projects/mmdet3d_plugin/models/dense_heads/dgcnn3d_head.py
+++ /dev/null
@@ -1,484 +0,0 @@
-import copy
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from mmcv.cnn import Linear, bias_init_with_prob
-from mmcv.runner import force_fp32, auto_fp16
-
-from mmdet.core import multi_apply, reduce_mean
-from mmdet.models.utils.transformer import inverse_sigmoid
-from mmdet.models import HEADS
-from mmdet.models.dense_heads import DETRHead
-from mmdet3d.core.bbox.coders import build_bbox_coder
-from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox
-
-
-@HEADS.register_module()
-class DGCNN3DHead(DETRHead):
-    """Head of DeformDETR3D.
-    Args:
-        with_box_refine (bool): Whether to refine the reference points
-            in the decoder. Defaults to False.
-        as_two_stage (bool) : Whether to generate the proposal from
-            the outputs of encoder.
-        transformer (obj:`ConfigDict`): ConfigDict is used for building
-            the Encoder and Decoder.
- """ - def __init__(self, - *args, - with_box_refine=False, - as_two_stage=False, - transformer=None, - bbox_coder=None, - num_cls_fcs=2, - **kwargs): - self.with_box_refine = with_box_refine - self.as_two_stage = as_two_stage - if self.as_two_stage: - transformer['as_two_stage'] = self.as_two_stage - if 'code_size' in kwargs: - self.code_size = kwargs['code_size'] - else: - self.code_size = 10 - self.bbox_coder = build_bbox_coder(bbox_coder) - self.pc_range = self.bbox_coder.pc_range - self.voxel_size = self.bbox_coder.voxel_size - - self.bev_shape = (int((self.pc_range[3] - self.pc_range[0]) / self.voxel_size[0]), - int((self.pc_range[4] - self.pc_range[1]) / self.voxel_size[1])) - - self.num_cls_fcs = num_cls_fcs - 1 - super(DGCNN3DHead, self).__init__( - *args, transformer=transformer, **kwargs) - - - def _init_layers(self): - """Initialize classification branch and regression branch of head.""" - cls_branch = [] - for _ in range(self.num_reg_fcs): - cls_branch.append(Linear(self.embed_dims, self.embed_dims)) - cls_branch.append(nn.LayerNorm(self.embed_dims)) - cls_branch.append(nn.ReLU(inplace=True)) - cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) - fc_cls = nn.Sequential(*cls_branch) - - reg_branch = [] - for _ in range(self.num_reg_fcs): - reg_branch.append(Linear(self.embed_dims, self.embed_dims)) - # reg_branch.append(nn.LayerNorm(self.embed_dims)) - reg_branch.append(nn.ReLU()) - reg_branch.append(Linear(self.embed_dims, self.code_size)) - reg_branch = nn.Sequential(*reg_branch) - - def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) - - # last reg_branch is used to generate proposal from - # encode feature map when as_two_stage is True. - num_pred = (self.transformer.decoder.num_layers + 1) if \ - self.as_two_stage else self.transformer.decoder.num_layers - - if self.with_box_refine: - self.cls_branches = _get_clones(fc_cls, num_pred) - self.reg_branches = _get_clones(reg_branch, num_pred) - else: - self.cls_branches = nn.ModuleList( - [fc_cls for _ in range(num_pred)]) - self.reg_branches = nn.ModuleList( - [reg_branch for _ in range(num_pred)]) - - if not self.as_two_stage: - self.query_embedding = nn.Embedding(self.num_query, - self.embed_dims * 2) - - def init_weights(self): - """Initialize weights of the head.""" - self.transformer.init_weights() - if self.loss_cls.use_sigmoid: - bias_init = bias_init_with_prob(0.01) - for m in self.cls_branches: - nn.init.constant_(m[-1].bias, bias_init) - if self.as_two_stage: - for m in self.reg_branches: - nn.init.constant_(m[-1].bias.data[2:], 0.0) - - def forward(self, mlvl_feats): - """Forward function. - Args: - mlvl_feats (tuple[Tensor]): Features from the upstream - network, each is a 4D-tensor with shape - (N, C, H, W). - Returns: - all_cls_scores (Tensor): Outputs from the classification head, \ - shape [nb_dec, bs, num_query, cls_out_channels]. Note \ - cls_out_channels should includes background. - all_bbox_preds (Tensor): Sigmoid outputs from the regression \ - head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ - Shape [nb_dec, bs, num_query, 9]. - enc_outputs_class (Tensor): The score of each point on encode \ - feature map, has shape (N, h*w, num_class). Only when \ - as_two_stage is Ture it would be returned, otherwise \ - `None` would be returned. - enc_outputs_coord (Tensor): The proposal generate from the \ - encode feature map, has shape (N, h*w, 4). 
Only when \ - as_two_stage is Ture it would be returned, otherwise \ - `None` would be returned. - """ - - batch_size = mlvl_feats[0].size(0) - input_img_h, input_img_w = self.bev_shape - img_masks = mlvl_feats[0].new_zeros( - (batch_size, input_img_h, input_img_w)) - - mlvl_masks = [] - mlvl_positional_encodings = [] - for feat in mlvl_feats: - mlvl_masks.append( - F.interpolate(img_masks[None], - size=feat.shape[-2:]).to(torch.bool).squeeze(0)) - mlvl_positional_encodings.append( - self.positional_encoding(mlvl_masks[-1])) - - query_embeds = None - if not self.as_two_stage: - query_embeds = self.query_embedding.weight - hs, init_reference, inter_references, \ - enc_outputs_class, enc_outputs_coord = self.transformer( - mlvl_feats, - mlvl_masks, - query_embeds, - mlvl_positional_encodings, - reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 - cls_branches=self.cls_branches if self.as_two_stage else None # noqa:E501 - ) - hs = hs.permute(0, 2, 1, 3) - outputs_classes = [] - outputs_coords = [] - - for lvl in range(hs.shape[0]): - if lvl == 0: - reference = init_reference - else: - reference = inter_references[lvl - 1] - reference = inverse_sigmoid(reference) - outputs_class = self.cls_branches[lvl](hs[lvl]) - tmp = self.reg_branches[lvl](hs[lvl]) - - if reference.shape[-1] == 4: - tmp += reference - else: - assert reference.shape[-1] == 2 - tmp[..., 0:2] += reference - tmp[..., 0:2] = tmp[..., 0:2].sigmoid() - tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) - tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) - - if tmp.size(-1) > 8: - outputs_coord = torch.cat((tmp[..., :6], tmp[..., 6:8], tmp[..., 8:]), -1) - else: - outputs_coord = torch.cat((tmp[..., :6], tmp[..., 6:8]), -1) - outputs_classes.append(outputs_class) - outputs_coords.append(outputs_coord) - - outputs_classes = torch.stack(outputs_classes) - outputs_coords = torch.stack(outputs_coords) - if self.as_two_stage: - outs = { - 'all_cls_scores': outputs_classes, - 'all_bbox_preds': outputs_coords, - 'enc_cls_scores': enc_outputs_class, - 'enc_bbox_preds': enc_outputs_coord.sigmoid(), - } - else: - outs = { - 'all_cls_scores': outputs_classes, - 'all_bbox_preds': outputs_coords, - 'enc_cls_scores': None, - 'enc_bbox_preds': None, - } - return outs - - def _get_target_single(self, - cls_score, - bbox_pred, - gt_bboxes, - gt_labels, - gt_bboxes_ignore=None): - """"Compute regression and classification targets for one image. - Outputs from a single decoder layer of a single feature level are used. - Args: - cls_score (Tensor): Box score logits from a single decoder layer - for one image. Shape [num_query, cls_out_channels]. - bbox_pred (Tensor): Sigmoid outputs from a single decoder layer - for one image, with normalized coordinate (cx, cy, w, h) and - shape [num_query, 4]. - gt_bboxes (Tensor): Ground truth bboxes for one image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (Tensor): Ground truth class indices for one image - with shape (num_gts, ). - gt_bboxes_ignore (Tensor, optional): Bounding boxes - which can be ignored. Default None. - Returns: - tuple[Tensor]: a tuple containing the following for one image. - - labels (Tensor): Labels of each image. - - label_weights (Tensor]): Label weights of each image. - - bbox_targets (Tensor): BBox targets of each image. - - bbox_weights (Tensor): BBox weights of each image. - - pos_inds (Tensor): Sampled positive indices for each image. 
- - neg_inds (Tensor): Sampled negative indices for each image. - """ - - num_bboxes = bbox_pred.size(0) - # assigner and sampler - assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, - gt_labels, gt_bboxes_ignore) - sampling_result = self.sampler.sample(assign_result, bbox_pred, - gt_bboxes) - pos_inds = sampling_result.pos_inds - neg_inds = sampling_result.neg_inds - - # label targets - labels = gt_bboxes.new_full((num_bboxes, ), - self.num_classes, - dtype=torch.long) - labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] - label_weights = gt_bboxes.new_ones(num_bboxes) - - # bbox targets - bbox_targets = torch.zeros_like(bbox_pred)[..., :self.code_size-1] - bbox_weights = torch.zeros_like(bbox_pred) - bbox_weights[pos_inds] = 1.0 - - # DETR - bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes - return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, - neg_inds) - - def get_targets(self, - cls_scores_list, - bbox_preds_list, - gt_bboxes_list, - gt_labels_list, - gt_bboxes_ignore_list=None): - """"Compute regression and classification targets for a batch image. - Outputs from a single decoder layer of a single feature level are used. - Args: - cls_scores_list (list[Tensor]): Box score logits from a single - decoder layer for each image with shape [num_query, - cls_out_channels]. - bbox_preds_list (list[Tensor]): Sigmoid outputs from a single - decoder layer for each image, with normalized coordinate - (cx, cy, w, h) and shape [num_query, 4]. - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_bboxes_ignore_list (list[Tensor], optional): Bounding - boxes which can be ignored for each image. Default None. - Returns: - tuple: a tuple containing the following targets. - - labels_list (list[Tensor]): Labels for all images. - - label_weights_list (list[Tensor]): Label weights for all \ - images. - - bbox_targets_list (list[Tensor]): BBox targets for all \ - images. - - bbox_weights_list (list[Tensor]): BBox weights for all \ - images. - - num_total_pos (int): Number of positive samples in all \ - images. - - num_total_neg (int): Number of negative samples in all \ - images. - """ - assert gt_bboxes_ignore_list is None, \ - 'Only supports for gt_bboxes_ignore setting to None.' - num_imgs = len(cls_scores_list) - gt_bboxes_ignore_list = [ - gt_bboxes_ignore_list for _ in range(num_imgs) - ] - - (labels_list, label_weights_list, bbox_targets_list, - bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( - self._get_target_single, cls_scores_list, bbox_preds_list, - gt_bboxes_list, gt_labels_list, gt_bboxes_ignore_list) - num_total_pos = sum((inds.numel() for inds in pos_inds_list)) - num_total_neg = sum((inds.numel() for inds in neg_inds_list)) - return (labels_list, label_weights_list, bbox_targets_list, - bbox_weights_list, num_total_pos, num_total_neg) - - def loss_single(self, - cls_scores, - bbox_preds, - gt_bboxes_list, - gt_labels_list, - gt_bboxes_ignore_list=None): - """"Loss function for outputs from a single decoder layer of a single - feature level. - Args: - cls_scores (Tensor): Box score logits from a single decoder layer - for all images. Shape [bs, num_query, cls_out_channels]. - bbox_preds (Tensor): Sigmoid outputs from a single decoder layer - for all images, with normalized coordinate (cx, cy, w, h) and - shape [bs, num_query, 4]. 
- gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_bboxes_ignore_list (list[Tensor], optional): Bounding - boxes which can be ignored for each image. Default None. - Returns: - dict[str, Tensor]: A dictionary of loss components for outputs from - a single decoder layer. - """ - num_imgs = cls_scores.size(0) - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] - cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, - gt_bboxes_list, gt_labels_list, gt_bboxes_ignore_list) - (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, - num_total_pos, num_total_neg) = cls_reg_targets - labels = torch.cat(labels_list, 0) - label_weights = torch.cat(label_weights_list, 0) - bbox_targets = torch.cat(bbox_targets_list, 0) - bbox_weights = torch.cat(bbox_weights_list, 0) - - # classification loss - cls_scores = cls_scores.reshape(-1, self.cls_out_channels) - # construct weighted avg_factor to match with the official DETR repo - cls_avg_factor = num_total_pos * 1.0 + \ - num_total_neg * self.bg_cls_weight - if self.sync_cls_avg_factor: - cls_avg_factor = reduce_mean( - cls_scores.new_tensor([cls_avg_factor])) - - cls_avg_factor = max(cls_avg_factor, 1) - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) - - # Compute the average number of gt boxes accross all gpus, for - # normalization purposes - num_total_pos = loss_cls.new_tensor([num_total_pos]) - num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() - - # regression L1 loss - bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) - normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) - isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) - loss_bbox = self.loss_bbox( - bbox_preds[isnotnan, :8], normalized_bbox_targets[isnotnan, :8], bbox_weights[isnotnan, :8], avg_factor=num_total_pos) - if self.code_size > 8: - loss_bbox_vel = self.loss_bbox( - bbox_preds[isnotnan, 8:], normalized_bbox_targets[isnotnan, 8:], bbox_weights[isnotnan, 8:], avg_factor=num_total_pos) - loss_bbox = loss_bbox + loss_bbox_vel * 0.2 - - return loss_cls, loss_bbox - - @force_fp32(apply_to=('preds_dicts')) - def loss(self, - gt_bboxes_list, - gt_labels_list, - preds_dicts, - gt_bboxes_ignore=None): - """"Loss function. - Args: - - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - preds_dicts: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_bbox_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (cx, cy, w, h) and shape - [nb_dec, bs, num_query, 4]. - enc_cls_scores (Tensor): Classification scores of - points on encode feature map , has shape - (N, h*w, num_classes). Only be passed when as_two_stage is - True, otherwise is None. - enc_bbox_preds (Tensor): Regression results of each points - on the encode feature map, has shape (N, h*w, 4). Only be - passed when as_two_stage is True, otherwise is None. 
- gt_bboxes_ignore (list[Tensor], optional): Bounding boxes - which can be ignored for each image. Default None. - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert gt_bboxes_ignore is None, \ - f'{self.__class__.__name__} only supports ' \ - f'for gt_bboxes_ignore setting to None.' - - all_cls_scores = preds_dicts['all_cls_scores'] - all_bbox_preds = preds_dicts['all_bbox_preds'] - enc_cls_scores = preds_dicts['enc_cls_scores'] - enc_bbox_preds = preds_dicts['enc_bbox_preds'] - - num_dec_layers = len(all_cls_scores) - device = gt_labels_list[0].device - gt_bboxes_list = [torch.cat( - (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), - dim=1).to(device) for gt_bboxes in gt_bboxes_list] - - all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] - all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] - all_gt_bboxes_ignore_list = [ - gt_bboxes_ignore for _ in range(num_dec_layers) - ] - - losses_cls, losses_bbox = multi_apply( - self.loss_single, all_cls_scores, all_bbox_preds, - all_gt_bboxes_list, all_gt_labels_list, - all_gt_bboxes_ignore_list) - - loss_dict = dict() - # loss of proposal generated from encode feature map. - if enc_cls_scores is not None: - binary_labels_list = [ - torch.zeros_like(gt_labels_list[i]) - for i in range(len(all_gt_labels_list)) - ] - enc_loss_cls, enc_losses_bbox = \ - self.loss_single(enc_cls_scores, enc_bbox_preds, - gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) - loss_dict['enc_loss_cls'] = enc_loss_cls - loss_dict['enc_loss_bbox'] = enc_losses_bbox - - # loss from the last decoder layer - loss_dict['loss_cls'] = losses_cls[-1] - loss_dict['loss_bbox'] = losses_bbox[-1] - - # loss from other decoder layers - num_dec_layer = 0 - for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], - losses_bbox[:-1]): - loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i - loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i - num_dec_layer += 1 - return loss_dict - - @force_fp32(apply_to=('preds_dicts')) - def get_bboxes(self, preds_dicts, img_metas, rescale=False): - """Generate bboxes from bbox head predictions. - Args: - preds_dicts (tuple[list[dict]]): Prediction results. - img_metas (list[dict]): Point cloud and image's meta info. - Returns: - list[dict]: Decoded bbox, scores and labels after nms. 
-        """
-        preds_dicts = self.bbox_coder.decode(preds_dicts)
-        num_samples = len(preds_dicts)
-        ret_list = []
-        for i in range(num_samples):
-            preds = preds_dicts[i]
-            bboxes = preds['bboxes']
-            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
-            if bboxes.size(-1) == 9:
-                bboxes = img_metas[i]['box_type_3d'](bboxes, 9)
-            else:
-                bboxes = img_metas[i]['box_type_3d'](bboxes, 7)
-            scores = preds['scores']
-            labels = preds['labels']
-            ret_list.append([bboxes, scores, labels])
-        return ret_list
\ No newline at end of file
diff --git a/projects/mmdet3d_plugin/models/detectors/__init__.py b/projects/mmdet3d_plugin/models/detectors/__init__.py
index ec02344..9fb9fa3 100755
--- a/projects/mmdet3d_plugin/models/detectors/__init__.py
+++ b/projects/mmdet3d_plugin/models/detectors/__init__.py
@@ -1,4 +1,4 @@
-from .obj_dgcnn import ObjDGCNN
 from .detr3d import Detr3D
+from .detr3d_old import Detr3D_old

-__all__ = [ 'ObjDGCNN', 'Detr3D']
+# from .visualizer_zlt import *
diff --git a/projects/mmdet3d_plugin/models/detectors/detr3d.py b/projects/mmdet3d_plugin/models/detectors/detr3d.py
old mode 100755
new mode 100644
index 5c84fb1..63613b2
--- a/projects/mmdet3d_plugin/models/detectors/detr3d.py
+++ b/projects/mmdet3d_plugin/models/detectors/detr3d.py
@@ -1,48 +1,92 @@
-import torch
+import os
+import time
+from typing import Dict, List, Optional, Sequence

-from mmcv.runner import force_fp32, auto_fp16
-from mmdet.models import DETECTORS
-from mmdet3d.core import bbox3d2result
+import numpy as np
+import torch
 from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from torch import Tensor
+
 from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask

-@DETECTORS.register_module()
+# import torchvision.utils as vutils
+# from mmdet3d.structures.bbox_3d.utils import get_lidar2img
+# from .visualizer_zlt import *
+# import cv2
+@MODELS.register_module()
 class Detr3D(MVXTwoStageDetector):
-    """Detr3D."""
+    """DETR3D: 3D Object Detection from Multi-view Images via
+    3D-to-2D Queries.
+
+    Args:
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`Det3DDataPreprocessor`. Defaults to None.
+        use_grid_mask (bool): Data augmentation. Whether to mask out some
+            grids during extract_img_feat. Defaults to False.
+        img_backbone (dict, optional): Backbone for extracting
+            image features. Defaults to None.
+        img_neck (dict, optional): Neck for extracting
+            image features. Defaults to None.
+        pts_bbox_head (dict, optional): Bbox head of
+            DETR3D. Defaults to None.
+        train_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        test_cfg (dict, optional): Test config of model.
+            Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+    """
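Being registered in `MODELS`, the detector is normally built from a config dict rather than constructed directly. A minimal sketch, assuming the plugin package has been imported (the `custom_imports` entry in the configs does exactly that):

    from mmdet3d.registry import MODELS
    import projects.mmdet3d_plugin  # noqa: F401, triggers registration

    detr3d_cls = MODELS.get('Detr3D')  # resolves the class registered above
    # the runner instead calls MODELS.build(cfg.model) with the full model
    # dict from one of the configs shipped in this patch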
    def __init__(self,
+                 data_preprocessor=None,
                 use_grid_mask=False,
-                 pts_voxel_layer=None,
-                 pts_voxel_encoder=None,
-                 pts_middle_encoder=None,
-                 pts_fusion_layer=None,
                 img_backbone=None,
-                 pts_backbone=None,
                 img_neck=None,
-                 pts_neck=None,
                 pts_bbox_head=None,
-                 img_roi_head=None,
-                 img_rpn_head=None,
                 train_cfg=None,
                 test_cfg=None,
-                 pretrained=None):
-        super(Detr3D,
-              self).__init__(pts_voxel_layer, pts_voxel_encoder,
-                             pts_middle_encoder, pts_fusion_layer,
-                             img_backbone, pts_backbone, img_neck, pts_neck,
-                             pts_bbox_head, img_roi_head, img_rpn_head,
-                             train_cfg, test_cfg, pretrained)
-        self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
+                 pretrained=None,
+                 debug_name=None,
+                 gtvis_range=[0, 105],
+                 vis_count=None):
+        super(Detr3D, self).__init__(img_backbone=img_backbone,
+                                     img_neck=img_neck,
+                                     pts_bbox_head=pts_bbox_head,
+                                     train_cfg=train_cfg,
+                                     test_cfg=test_cfg,
+                                     data_preprocessor=data_preprocessor)
+        self.grid_mask = GridMask(True,
+                                  True,
+                                  rotate=1,
+                                  offset=False,
+                                  ratio=0.5,
+                                  mode=1,
+                                  prob=0.7)
        self.use_grid_mask = use_grid_mask
+        self.debug_name = debug_name
+        self.gtvis_range = gtvis_range
+        self.vis_count = vis_count
+
+    def extract_img_feat(self, img: Tensor,
+                         batch_input_metas: List[dict]) -> List[Tensor]:
+        """Extract features from images.
+
+        Args:
+            img (Tensor): Batched multi-view image tensor with shape
+                (B, N, C, H, W).
+            batch_input_metas (list[dict]): Meta information of multiple
+                inputs in a batch.
+
+        Returns:
+            list[Tensor]: Multi-level image features.
+        """
-    def extract_img_feat(self, img, img_metas):
-        """Extract features of images."""
        B = img.size(0)
        if img is not None:
-            input_shape = img.shape[-2:]
+            input_shape = img.shape[-2:]  # img: (B, N, C, H, W)
            # update real input shape of each single img
-            for img_meta in img_metas:
+            for img_meta in batch_input_metas:
                img_meta.update(input_shape=input_shape)

            if img.dim() == 5 and img.size(0) == 1:
@@ -51,7 +95,7 @@ def extract_img_feat(self, img, img_metas):
                B, N, C, H, W = img.size()
                img = img.view(B * N, C, H, W)
            if self.use_grid_mask:
-                img = self.grid_mask(img)
+                img = self.grid_mask(img)  # mask out some grids
            img_feats = self.img_backbone(img)
            if isinstance(img_feats, dict):
                img_feats = list(img_feats.values())
@@ -59,158 +103,142 @@
            return None
        if self.with_img_neck:
            img_feats = self.img_neck(img_feats)
+
        img_feats_reshaped = []
        for img_feat in img_feats:
            BN, C, H, W = img_feat.size()
            img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
        return img_feats_reshaped

-    @auto_fp16(apply_to=('img'), out_fp32=True)
-    def extract_feat(self, img, img_metas):
-        """Extract features from images and points."""
-        img_feats = self.extract_img_feat(img, img_metas)
+    def extract_feat(self, batch_inputs_dict: Dict,
+                     batch_input_metas: List[dict]) -> List[Tensor]:
+        """Extract features from images.
+
+        Refer to self.extract_img_feat().
+        """
+        imgs = batch_inputs_dict.get('imgs', None)
+        img_feats = self.extract_img_feat(imgs, batch_input_metas)
        return img_feats
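extract_img_feat folds the N camera views into the batch dimension for the backbone and unfolds them again for the head. The reshape round trip in isolation (illustrative only):

    import torch

    B, N, C, H, W = 2, 6, 3, 64, 64                # 2 samples, 6 cameras
    img = torch.rand(B, N, C, H, W)
    flat = img.view(B * N, C, H, W)                # views become batch items
    feat = torch.rand(B * N, 256, H // 4, W // 4)  # stand-in backbone output
    per_view = feat.view(B, N, 256, H // 4, W // 4)  # restore view dimension
    assert per_view.shape == (2, 6, 256, 16, 16)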
-    def forward_pts_train(self,
-                          pts_feats,
-                          gt_bboxes_3d,
-                          gt_labels_3d,
-                          img_metas,
-                          gt_bboxes_ignore=None):
-        """Forward function for point cloud branch.
+    def _forward(self):
+        raise NotImplementedError('tensor mode is not supported yet')
+
+    # original forward_train
+    def loss(self, batch_inputs_dict: Dict[str, Tensor],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        """
        Args:
-            pts_feats (list[torch.Tensor]): Features of point cloud branch
-            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
-                boxes for each sample.
-            gt_labels_3d (list[torch.Tensor]): Ground truth labels for
-                boxes of each sampole
-            img_metas (list[dict]): Meta information of samples.
-            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
-                boxes to be ignored. Defaults to None.
+            batch_inputs_dict (dict): The model input dict which includes
+                the `imgs` key.
+
+                - imgs (torch.Tensor): Tensor of batched multi-view images,
+                  has shape (B, N, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instances_3d`.
+
        Returns:
-            dict: Losses of each branch.
+            dict[str, Tensor]: A dictionary of loss components.
+        """
-        outs = self.pts_bbox_head(pts_feats, img_metas)
-        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
-        losses = self.pts_bbox_head.loss(*loss_inputs)
-        return losses
-
-    @force_fp32(apply_to=('img', 'points'))
-    def forward(self, return_loss=True, **kwargs):
-        """Calls either forward_train or forward_test depending on whether
-        return_loss=True.
-        Note this setting will change the expected inputs. When
-        `return_loss=True`, img and img_metas are single-nested (i.e.
-        torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
-        img_metas should be double nested (i.e. list[torch.Tensor],
-        list[list[dict]]), with the outer list indicating test time
-        augmentations.
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        batch_input_metas = self.add_lidar2img(batch_input_metas)
+        img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
+        outs = self.pts_bbox_head(img_feats, batch_input_metas, **kwargs)
+
+        batch_gt_instances_3d = [
+            item.gt_instances_3d for item in batch_data_samples
+        ]
+        loss_inputs = [batch_gt_instances_3d, outs]
+        losses_pts = self.pts_bbox_head.loss_by_feat(*loss_inputs)
+
+        return losses_pts
+
+    # original simple_test
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which includes
+                the `imgs` key.
+
+                - imgs (torch.Tensor): Tensor of batched multi-view images,
+                  has shape (B, N, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instances_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input sample. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
+              contains a tensor with shape (num_instances, 9).
""" - if return_loss: - return self.forward_train(**kwargs) - else: - return self.forward_test(**kwargs) - - def forward_train(self, - points=None, - img_metas=None, - gt_bboxes_3d=None, - gt_labels_3d=None, - gt_labels=None, - gt_bboxes=None, - img=None, - proposals=None, - gt_bboxes_ignore=None, - img_depth=None, - img_mask=None): - """Forward training function. + batch_input_metas = [item.metainfo for item in batch_data_samples] + batch_input_metas = self.add_lidar2img(batch_input_metas) + img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas) + outs = self.pts_bbox_head(img_feats, batch_input_metas) + + results_list_3d = self.pts_bbox_head.predict_by_feat( + outs, batch_input_metas, **kwargs) + detsamples = self.add_pred_to_datasample(batch_data_samples, + results_list_3d) + return detsamples + + # may need speed-up + def add_lidar2img(self, batch_input_metas: List[Dict]) -> List[Dict]: + """add 'lidar2img' transformation matrix into batch_input_metas. + Args: - points (list[torch.Tensor], optional): Points of each sample. - Defaults to None. - img_metas (list[dict], optional): Meta information of each sample. - Defaults to None. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): - Ground truth 3D boxes. Defaults to None. - gt_labels_3d (list[torch.Tensor], optional): Ground truth labels - of 3D boxes. Defaults to None. - gt_labels (list[torch.Tensor], optional): Ground truth labels - of 2D boxes in images. Defaults to None. - gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in - images. Defaults to None. - img (torch.Tensor optional): Images of each sample with shape - (N, C, H, W). Defaults to None. - proposals ([list[torch.Tensor], optional): Predicted proposals - used for training Fast RCNN. Defaults to None. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - 2D boxes in images to be ignored. Defaults to None. + batch_input_metas (list[dict]): Meta information of multiple inputs + in a batch. + Returns: - dict: Losses of different branches. 
+            batch_input_metas (list[dict]): Meta info with lidar2img added.
        """
-        img_feats = self.extract_feat(img=img, img_metas=img_metas)
-        losses = dict()
-        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
-                                            gt_labels_3d, img_metas,
-                                            gt_bboxes_ignore)
-        losses.update(losses_pts)
-        return losses
-
-    def forward_test(self, img_metas, img=None, **kwargs):
-        for var, name in [(img_metas, 'img_metas')]:
-            if not isinstance(var, list):
-                raise TypeError('{} must be a list, but got {}'.format(
-                    name, type(var)))
-        img = [img] if img is None else img
-        return self.simple_test(img_metas[0], img[0], **kwargs)
-        # if num_augs == 1:
-        #     img = [img] if img is None else img
-        #     return self.simple_test(None, img_metas[0], img[0], **kwargs)
-        # else:
-        #     return self.aug_test(None, img_metas, img, **kwargs)
-
-    def simple_test_pts(self, x, img_metas, rescale=False):
-        """Test function of point cloud branch."""
-        outs = self.pts_bbox_head(x, img_metas)
-        bbox_list = self.pts_bbox_head.get_bboxes(
-            outs, img_metas, rescale=rescale)
-        bbox_results = [
-            bbox3d2result(bboxes, scores, labels)
-            for bboxes, scores, labels in bbox_list
-        ]
-        return bbox_results
-
-    def simple_test(self, img_metas, img=None, rescale=False):
-        """Test function without augmentaiton."""
-        img_feats = self.extract_feat(img=img, img_metas=img_metas)
-
-        bbox_list = [dict() for i in range(len(img_metas))]
-        bbox_pts = self.simple_test_pts(
-            img_feats, img_metas, rescale=rescale)
-        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
-            result_dict['pts_bbox'] = pts_bbox
-        return bbox_list
-
-    def aug_test_pts(self, feats, img_metas, rescale=False):
-        feats_list = []
-        for j in range(len(feats[0])):
-            feats_list_level = []
-            for i in range(len(feats)):
-                feats_list_level.append(feats[i][j])
-            feats_list.append(torch.stack(feats_list_level, -1).mean(-1))
-        outs = self.pts_bbox_head(feats_list, img_metas)
-        bbox_list = self.pts_bbox_head.get_bboxes(
-            outs, img_metas, rescale=rescale)
-        bbox_results = [
-            bbox3d2result(bboxes, scores, labels)
-            for bboxes, scores, labels in bbox_list
-        ]
-        return bbox_results
-
-    def aug_test(self, img_metas, imgs=None, rescale=False):
-        """Test function with augmentaiton."""
-        img_feats = self.extract_feats(img_metas, imgs)
-        img_metas = img_metas[0]
-        bbox_list = [dict() for i in range(len(img_metas))]
-        bbox_pts = self.aug_test_pts(img_feats, img_metas, rescale)
-        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
-            result_dict['pts_bbox'] = pts_bbox
-        return bbox_list
+        for meta in batch_input_metas:
+            l2i = list()
+            for i in range(len(meta['cam2img'])):
+                c2i = torch.tensor(meta['cam2img'][i]).double()
+                l2c = torch.tensor(meta['lidar2cam'][i]).double()
+                l2i.append(get_lidar2img(c2i, l2c).float().numpy())
+            meta['lidar2img'] = l2i
+        return batch_input_metas
+
+
+# https://github.com/open-mmlab/mmdetection3d/pull/2110
+update_info_BUG_FIX = True
+
+
+def get_lidar2img(cam2img, lidar2cam):
+    """Get the projection matrix of lidar2img.
+
+    Args:
+        cam2img (torch.Tensor): A 3x3 or 4x4 projection matrix.
+        lidar2cam (torch.Tensor): A 3x3 or 4x4 projection matrix.
+
+    Returns:
+        torch.Tensor: Transformation matrix with shape 4x4.
+    """
+    if not update_info_BUG_FIX:
+        lidar2cam_r = lidar2cam[:3, :3]
+        lidar2cam_t = lidar2cam[:3, 3]
+        lidar2cam_t = torch.matmul(lidar2cam_t, lidar2cam_r.T)
+        lidar2cam[:3, 3] = lidar2cam_t
+    if cam2img.shape == (3, 3):
+        temp = cam2img.new_zeros(4, 4)
+        temp[:3, :3] = cam2img
+        temp[3, 3] = 1
+        cam2img = temp
+
+    if lidar2cam.shape == (3, 3):
+        temp = lidar2cam.new_zeros(4, 4)
+        temp[:3, :3] = lidar2cam
+        temp[3, 3] = 1
+        lidar2cam = temp
+    return torch.matmul(cam2img, lidar2cam)
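The composed matrix projects homogeneous LiDAR points into pixels, x_img ~ cam2img @ lidar2cam @ x_lidar. A worked toy example using the function above, with an identity extrinsic so the LiDAR and camera frames coincide:

    import torch

    cam2img = torch.tensor([[1000.0, 0.0, 640.0],
                            [0.0, 1000.0, 360.0],
                            [0.0, 0.0, 1.0]])
    lidar2cam = torch.eye(4)
    l2i = get_lidar2img(cam2img, lidar2cam)  # pads cam2img to 4x4 first
    pt = torch.tensor([0.0, 0.0, 2.0, 1.0])  # 2 m along the optical axis
    uvw = l2i @ pt
    print(uvw[:2] / uvw[2])  # tensor([640., 360.]) -> the principal point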
+ """ + if update_info_BUG_FIX == False: + lidar2cam_r = lidar2cam[:3, :3] + lidar2cam_t = lidar2cam[:3, 3] + lidar2cam_t = torch.matmul(lidar2cam_t, lidar2cam_r.T) + lidar2cam[:3, 3] = lidar2cam_t + if cam2img.shape == (3, 3): + temp = cam2img.new_zeros(4, 4) + temp[:3, :3] = cam2img + temp[3, 3] = 1 + cam2img = temp + + if lidar2cam.shape == (3, 3): + temp = lidar2cam.new_zeros(4, 4) + temp[:3, :3] = lidar2cam + temp[3, 3] = 1 + lidar2cam = temp + return torch.matmul(cam2img, lidar2cam) diff --git a/projects/mmdet3d_plugin/models/detectors/detr3d_old.py b/projects/mmdet3d_plugin/models/detectors/detr3d_old.py new file mode 100644 index 0000000..f5d30bb --- /dev/null +++ b/projects/mmdet3d_plugin/models/detectors/detr3d_old.py @@ -0,0 +1,40 @@ +from typing import Dict, List, Optional, Sequence + +import numpy as np +from mmdet3d.registry import MODELS +from mmdet3d.structures import Det3DDataSample +from torch import Tensor + +from .detr3d import Detr3D + + +@MODELS.register_module() +class Detr3D_old(Detr3D): + """Detr3D for old models trained earlier than mmdet3d-1.0.0.""" + + def __init__(self, **kawrgs): + super().__init__(**kawrgs) + + def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]], + batch_data_samples: List[Det3DDataSample], + **kwargs) -> List[Det3DDataSample]: + + batch_input_metas = [item.metainfo for item in batch_data_samples] + batch_input_metas = self.add_lidar2img(batch_input_metas) + img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas) + outs = self.pts_bbox_head(img_feats, batch_input_metas) + + results_list_3d = self.pts_bbox_head.predict_by_feat( + outs, batch_input_metas, **kwargs) + + # change the bboxes' format + for item in results_list_3d: + #cx, cy, cz, w, l, h, rot, vx, vy + item.bboxes_3d.tensor[..., [3, 4]] = item.bboxes_3d.tensor[..., + [4, 3]] + item.bboxes_3d.tensor[ + ..., 6] = -item.bboxes_3d.tensor[..., 6] - np.pi / 2 + + detsamples = self.add_pred_to_datasample(batch_data_samples, + results_list_3d) + return detsamples diff --git a/projects/mmdet3d_plugin/models/detectors/obj_dgcnn.py b/projects/mmdet3d_plugin/models/detectors/obj_dgcnn.py deleted file mode 100755 index 16fa0de..0000000 --- a/projects/mmdet3d_plugin/models/detectors/obj_dgcnn.py +++ /dev/null @@ -1,187 +0,0 @@ -import torch - -from mmdet.models import DETECTORS -from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d -from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector - - -@DETECTORS.register_module() -class ObjDGCNN(MVXTwoStageDetector): - """Base class of Multi-modality VoxelNet.""" - - def __init__(self, - pts_voxel_layer=None, - pts_voxel_encoder=None, - pts_middle_encoder=None, - pts_fusion_layer=None, - img_backbone=None, - pts_backbone=None, - img_neck=None, - pts_neck=None, - pts_bbox_head=None, - img_roi_head=None, - img_rpn_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None): - super(ObjDGCNN, - self).__init__(pts_voxel_layer, pts_voxel_encoder, - pts_middle_encoder, pts_fusion_layer, - img_backbone, pts_backbone, img_neck, pts_neck, - pts_bbox_head, img_roi_head, img_rpn_head, - train_cfg, test_cfg, pretrained) - - def extract_pts_feat(self, pts, img_feats, img_metas): - """Extract features of points.""" - if not self.with_pts_bbox: - return None - - voxels, num_points, coors = self.voxelize(pts) - - voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) - batch_size = coors[-1, 0] + 1 - x = self.pts_middle_encoder(voxel_features, coors, batch_size) - x = self.pts_backbone(x) - if 
self.with_pts_neck: - x = self.pts_neck(x) - return x - - def forward_pts_train(self, - pts_feats, - gt_bboxes_3d, - gt_labels_3d, - img_metas, - gt_bboxes_ignore=None): - """Forward function for point cloud branch. - Args: - pts_feats (list[torch.Tensor]): Features of point cloud branch - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. - gt_labels_3d (list[torch.Tensor]): Ground truth labels for - boxes of each sampole - img_metas (list[dict]): Meta information of samples. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - Returns: - dict: Losses of each branch. - """ - outs = self.pts_bbox_head(pts_feats) - loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] - losses = self.pts_bbox_head.loss(*loss_inputs) - return losses - - def simple_test_pts(self, x, img_metas, rescale=False): - """Test function of point cloud branch.""" - outs = self.pts_bbox_head(x) - bbox_list = self.pts_bbox_head.get_bboxes( - outs, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def aug_test_pts(self, feats, img_metas, rescale=False): - """Test function of point cloud branch with augmentaiton. - The function implementation process is as follows: - - step 1: map features back for double-flip augmentation. - - step 2: merge all features and generate boxes. - - step 3: map boxes back for scale augmentation. - - step 4: merge results. - Args: - feats (list[torch.Tensor]): Feature of point cloud. - img_metas (list[dict]): Meta information of samples. - rescale (bool): Whether to rescale bboxes. Default: False. - Returns: - dict: Returned bboxes consists of the following keys: - - boxes_3d (:obj:`LiDARInstance3DBoxes`): Predicted bboxes. - - scores_3d (torch.Tensor): Scores of predicted boxes. - - labels_3d (torch.Tensor): Labels of predicted boxes. - """ - # only support aug_test for one sample - outs_list = [] - for x, img_meta in zip(feats, img_metas): - outs = self.pts_bbox_head(x[0]) - # merge augmented outputs before decoding bboxes - for task_id, out in enumerate(outs): - for key in out[0].keys(): - if img_meta[0]['pcd_horizontal_flip']: - outs[task_id][0][key] = torch.flip( - outs[task_id][0][key], dims=[2]) - if key == 'reg': - outs[task_id][0][key][:, 1, ...] = 1 - outs[ - task_id][0][key][:, 1, ...] - elif key == 'rot': - outs[task_id][0][ - key][:, 1, - ...] = -outs[task_id][0][key][:, 1, ...] - elif key == 'vel': - outs[task_id][0][ - key][:, 1, - ...] = -outs[task_id][0][key][:, 1, ...] - if img_meta[0]['pcd_vertical_flip']: - outs[task_id][0][key] = torch.flip( - outs[task_id][0][key], dims=[3]) - if key == 'reg': - outs[task_id][0][key][:, 0, ...] = 1 - outs[ - task_id][0][key][:, 0, ...] - elif key == 'rot': - outs[task_id][0][ - key][:, 0, - ...] = -outs[task_id][0][key][:, 0, ...] - elif key == 'vel': - outs[task_id][0][ - key][:, 0, - ...] = -outs[task_id][0][key][:, 0, ...] 
- - outs_list.append(outs) - - preds_dicts = dict() - scale_img_metas = [] - - # concat outputs sharing the same pcd_scale_factor - for i, (img_meta, outs) in enumerate(zip(img_metas, outs_list)): - pcd_scale_factor = img_meta[0]['pcd_scale_factor'] - if pcd_scale_factor not in preds_dicts.keys(): - preds_dicts[pcd_scale_factor] = outs - scale_img_metas.append(img_meta) - else: - for task_id, out in enumerate(outs): - for key in out[0].keys(): - preds_dicts[pcd_scale_factor][task_id][0][key] += out[ - 0][key] - - aug_bboxes = [] - - for pcd_scale_factor, preds_dict in preds_dicts.items(): - for task_id, pred_dict in enumerate(preds_dict): - # merge outputs with different flips before decoding bboxes - for key in pred_dict[0].keys(): - preds_dict[task_id][0][key] /= len(outs_list) / len( - preds_dicts.keys()) - bbox_list = self.pts_bbox_head.get_bboxes( - preds_dict, img_metas[0], rescale=rescale) - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - if len(preds_dicts.keys()) > 1: - # merge outputs with different scales after decoding bboxes - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, scale_img_metas, - self.pts_bbox_head.test_cfg) - return merged_bboxes - else: - for key in bbox_list[0].keys(): - bbox_list[0][key] = bbox_list[0][key].to('cpu') - return bbox_list[0] - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test function with augmentaiton.""" - img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) - bbox_list = dict() - if pts_feats and self.with_pts_bbox: - pts_bbox = self.aug_test_pts(pts_feats, img_metas, rescale) - bbox_list.update(pts_bbox=pts_bbox) - return [bbox_list] \ No newline at end of file diff --git a/projects/mmdet3d_plugin/models/task_modules/__init__.py b/projects/mmdet3d_plugin/models/task_modules/__init__.py new file mode 100644 index 0000000..82e994e --- /dev/null +++ b/projects/mmdet3d_plugin/models/task_modules/__init__.py @@ -0,0 +1,3 @@ +from .hungarian_assigner_3d import HungarianAssigner3D +from .match_cost import BBox3DL1Cost +from .nms_free_coder import NMSFreeCoder diff --git a/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py b/projects/mmdet3d_plugin/models/task_modules/hungarian_assigner_3d.py similarity index 63% rename from projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py rename to projects/mmdet3d_plugin/models/task_modules/hungarian_assigner_3d.py index ccb4253..f5f8167 100755 --- a/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py +++ b/projects/mmdet3d_plugin/models/task_modules/hungarian_assigner_3d.py @@ -1,11 +1,14 @@ +from typing import List + import torch +from mmdet3d.registry import TASK_UTILS +from mmdet.models.task_modules.assigners import AssignResult # check +from mmdet.models.task_modules.assigners import BaseAssigner +from mmengine.structures import InstanceData +from torch import Tensor -from mmdet.core.bbox.builder import BBOX_ASSIGNERS -from mmdet.core.bbox.assigners import AssignResult -from mmdet.core.bbox.assigners import BaseAssigner -from mmdet.core.bbox.match_costs import build_match_cost -from mmdet.models.utils.transformer import inverse_sigmoid -from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +# from mmdet.core.bbox.match_costs import build_match_cost +from .util import normalize_bbox try: from scipy.optimize import linear_sum_assignment @@ -13,49 +16,46 @@ linear_sum_assignment = None 
-@BBOX_ASSIGNERS.register_module()
+@TASK_UTILS.register_module()
 class HungarianAssigner3D(BaseAssigner):
     """Computes one-to-one matching between predictions and ground truth.
+
     This class computes an assignment between the targets and the predictions
-    based on the costs. The costs are weighted sum of three components:
-    classification cost, regression L1 cost and regression iou cost. The
-    targets don't include the no_object, so generally there are more
-    predictions than targets. After the one-to-one matching, the un-matched
-    are treated as backgrounds. Thus each query prediction will be assigned
-    with `0` or a positive integer indicating the ground truth index:
+    based on the costs, which are a weighted sum of several components.
+    For DETR3D the cost is a weighted sum of the classification cost, the
+    regression L1 cost and the regression IoU cost. The targets don't
+    include the no_object, so generally there are more predictions than
+    targets. After the one-to-one matching, the un-matched are treated as
+    backgrounds. Thus each query prediction will be assigned with `0` or a
+    positive integer indicating the ground truth index:
+
     - 0: negative sample, no assigned gt
     - positive integer: positive sample, index (1-based) of assigned gt
+
     Args:
-        cls_weight (int | float, optional): The scale factor for classification
-            cost. Default 1.0.
-        bbox_weight (int | float, optional): The scale factor for regression
-            L1 cost. Default 1.0.
-        iou_weight (int | float, optional): The scale factor for regression
-            iou cost. Default 1.0.
-        iou_calculator (dict | optional): The config for the iou calculation.
-            Default type `BboxOverlaps2D`.
-        iou_mode (str | optional): "iou" (intersection over union), "iof"
-            (intersection over foreground), or "giou" (generalized
-            intersection over union). Default "giou".
+        cls_cost (obj:`ConfigDict`): Config of the classification match cost.
+        reg_cost (obj:`ConfigDict`): Config of the regression L1 match cost.
+        iou_cost (obj:`ConfigDict`): Config of the regression IoU match cost.
+        pc_range (List): Perception range of the detector.
     """

     def __init__(self,
                  cls_cost=dict(type='ClassificationCost', weight=1.),
                  reg_cost=dict(type='BBoxL1Cost', weight=1.0),
                  iou_cost=dict(type='IoUCost', weight=0.0),
-                 pc_range=None):
-        self.cls_cost = build_match_cost(cls_cost)
-        self.reg_cost = build_match_cost(reg_cost)
-        self.iou_cost = build_match_cost(iou_cost)
+                 pc_range: List = None):
+        self.cls_cost = TASK_UTILS.build(cls_cost)
+        self.reg_cost = TASK_UTILS.build(reg_cost)
+        self.iou_cost = TASK_UTILS.build(iou_cost)
         self.pc_range = pc_range

     def assign(self,
-               bbox_pred,
-               cls_pred,
-               gt_bboxes,
-               gt_labels,
+               bbox_pred: Tensor,
+               cls_pred: Tensor,
+               gt_bboxes: Tensor,
+               gt_labels: Tensor,
                gt_bboxes_ignore=None,
-               eps=1e-7):
+               eps=1e-7) -> AssignResult:
         """Computes one-to-one matching based on the weighted costs.
         This method assign each query prediction to a ground truth or
         background. The `assigned_gt_inds` with -1 means don't care,
@@ -70,23 +70,22 @@ def assign(self,
         and assign the corresponding gt index (plus 1) to it.
         Args:
             bbox_pred (Tensor): Predicted boxes with normalized coordinates
-                (cx, cy, w, h), which are all in range [0, 1]. Shape
-                [num_query, 4].
+                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y), as produced by
+                `normalize_bbox`. Shape [num_query, 10].
             cls_pred (Tensor): Predicted classification logits, shape
                 [num_query, num_class].
             gt_bboxes (Tensor): Ground truth boxes with unnormalized
-                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+                coordinates (cx,cy,cz,l,w,h,φ,v_x,v_y). Shape [num_gt, 9].
             gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
                labelled as `ignored`. Default None.
-            eps (int | float, optional): A value added to the denominator for
-                numerical stability. Default 1e-7.
+            eps (int | float, optional): Unused parameter.
        Returns:
            :obj:`AssignResult`: The assigned result.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
-        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)  # e.g. 9, 900

        # 1. assign -1 by default
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
@@ -100,19 +99,24 @@ def assign(self,
        if num_gts == 0:
            # No ground truth, assign all to background
            assigned_gt_inds[:] = 0
-            return AssignResult(
-                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+            return AssignResult(num_gts,
+                                assigned_gt_inds,
+                                None,
+                                labels=assigned_labels)

        # 2. compute the weighted costs
        # classification and bboxcost.
-        cls_cost = self.cls_cost(cls_pred, gt_labels)
+        # wrap inputs in InstanceData for the dev-1.x match-cost interface
+        pred_instances = InstanceData(scores=cls_pred)
+        gt_instances = InstanceData(labels=gt_labels)
+        cls_cost = self.cls_cost(pred_instances, gt_instances)
        # regression L1 cost
        normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
        reg_cost = self.reg_cost(bbox_pred[:, :8],
                                 normalized_gt_bboxes[:, :8])
-
+        # weighted sum of above two costs
        cost = cls_cost + reg_cost
-
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
@@ -130,5 +134,7 @@ def assign(self,
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
-        return AssignResult(
-            num_gts, assigned_gt_inds, None, labels=assigned_labels)
\ No newline at end of file
+        return AssignResult(num_gts,
+                            assigned_gt_inds,
+                            None,
+                            labels=assigned_labels)
diff --git a/projects/mmdet3d_plugin/models/task_modules/match_cost.py b/projects/mmdet3d_plugin/models/task_modules/match_cost.py
new file mode 100755
index 0000000..89152f1
--- /dev/null
+++ b/projects/mmdet3d_plugin/models/task_modules/match_cost.py
@@ -0,0 +1,32 @@
+from typing import Union
+
+import torch
+from mmdet3d.registry import TASK_UTILS
+from torch import Tensor
+
+
+@TASK_UTILS.register_module()
+class BBox3DL1Cost(object):
+    """BBox3DL1Cost.
+
+    Args:
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self, weight: Union[float, int] = 1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred: Tensor, gt_bboxes: Tensor) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y), as from `normalize_bbox`.
+                Shape [num_query, 10].
+            gt_bboxes (Tensor): Ground truth boxes with `normalized`
+                coordinates (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y). Shape [num_gt, 10].
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
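+
+        Example (illustrative only, not from the original DETR3D code):
+            >>> import torch
+            >>> self = BBox3DL1Cost(weight=0.25)
+            >>> bbox_pred = torch.rand(900, 10)
+            >>> gt_bboxes = torch.rand(9, 10)
+            >>> self(bbox_pred, gt_bboxes).shape
+            torch.Size([900, 9])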
+ """ + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight diff --git a/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py b/projects/mmdet3d_plugin/models/task_modules/nms_free_coder.py similarity index 73% rename from projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py rename to projects/mmdet3d_plugin/models/task_modules/nms_free_coder.py index 11eafc3..8cbe3ac 100755 --- a/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py +++ b/projects/mmdet3d_plugin/models/task_modules/nms_free_coder.py @@ -1,13 +1,14 @@ import torch +from mmdet3d.registry import TASK_UTILS +from mmdet.models.task_modules import BaseBBoxCoder -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS -from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox +from .util import denormalize_bbox -@BBOX_CODERS.register_module() +@TASK_UTILS.register_module() class NMSFreeCoder(BaseBBoxCoder): """Bbox coder for NMS-free detector. + Args: pc_range (list[float]): Range of point cloud. post_center_range (list[float]): Limit of the center. @@ -15,17 +16,16 @@ class NMSFreeCoder(BaseBBoxCoder): max_num (int): Max number to be kept. Default: 100. score_threshold (float): Threshold to filter boxes based on score. Default: None. - code_size (int): Code size of bboxes. Default: 9 """ def __init__(self, - pc_range, + pc_range=None, voxel_size=None, post_center_range=None, max_num=100, score_threshold=None, num_classes=10): - + self.pc_range = pc_range self.voxel_size = voxel_size self.post_center_range = post_center_range @@ -38,34 +38,36 @@ def encode(self): def decode_single(self, cls_scores, bbox_preds): """Decode bboxes. + Args: cls_scores (Tensor): Outputs from the classification head, \ shape [num_query, cls_out_channels]. Note \ cls_out_channels should includes background. bbox_preds (Tensor): Outputs from the regression \ - head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ - Shape [num_query, 9]. + head with normalized coordinate format (cx, cy, l, w, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 10]. Returns: list[dict]: Decoded boxes. """ max_num = self.max_num cls_scores = cls_scores.sigmoid() - scores, indexs = cls_scores.view(-1).topk(max_num) - labels = indexs % self.num_classes - bbox_index = indexs // self.num_classes + scores, indexes = cls_scores.view(-1).topk(max_num) + labels = indexes % self.num_classes + bbox_index = indexes // self.num_classes bbox_preds = bbox_preds[bbox_index] - final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) - final_scores = scores - final_preds = labels + # [[cx, cy, cz, l, w, h, rot, vx, vy]] + final_box_preds = denormalize_bbox(bbox_preds, None) + final_scores = scores + final_preds = labels # use score threshold if self.score_threshold is not None: thresh_mask = final_scores > self.score_threshold if self.post_center_range is not None: - self.post_center_range = torch.tensor( - self.post_center_range, device=scores.device) + self.post_center_range = torch.tensor(self.post_center_range, + device=scores.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(1) mask &= (final_box_preds[..., :3] <= @@ -91,21 +93,24 @@ def decode_single(self, cls_scores, bbox_preds): def decode(self, preds_dicts): """Decode bboxes. + Args: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. 
            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
-                head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
-                Shape [nb_dec, bs, num_query, 9].
+                head with normalized coordinate format (cx, cy, l, w, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [nb_dec, bs, num_query, 10].
        Returns:
            list[dict]: Decoded boxes.
        """
-        all_cls_scores = preds_dicts['all_cls_scores'][-1]
+        all_cls_scores = preds_dicts['all_cls_scores'][
+            -1]  # predictions of the last decoder layer
        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
-
+
        batch_size = all_cls_scores.size()[0]
        predictions_list = []
        for i in range(batch_size):
-            predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
-        return predictions_list
\ No newline at end of file
+            predictions_list.append(
+                self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+        return predictions_list
diff --git a/projects/mmdet3d_plugin/models/task_modules/util.py b/projects/mmdet3d_plugin/models/task_modules/util.py
new file mode 100755
index 0000000..ea1e51b
--- /dev/null
+++ b/projects/mmdet3d_plugin/models/task_modules/util.py
@@ -0,0 +1,76 @@
+from typing import List
+
+import torch
+from torch import Tensor
+
+
+def normalize_bbox(bboxes: Tensor, pc_range: List) -> Tensor:
+    """Normalize bboxes.
+    Args:
+        bboxes (Tensor): boxes with unnormalized
+            coordinates (cx,cy,cz,l,w,h,φ,v_x,v_y). Shape [num_gt, 9].
+        pc_range (List): Perception range of the detector (unused here).
+    Returns:
+        normalized_bboxes (Tensor): boxes with coordinates \
+            (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y); centers stay in metric \
+            units, sizes are log-scaled. Shape [num_gt, 10].
+    """
+
+    cx = bboxes[..., 0:1]
+    cy = bboxes[..., 1:2]
+    cz = bboxes[..., 2:3]
+    w = bboxes[..., 3:4].log()
+    l = bboxes[..., 4:5].log()
+    h = bboxes[..., 5:6].log()
+
+    rot = bboxes[..., 6:7]
+    if bboxes.size(-1) > 7:
+        vx = bboxes[..., 7:8]
+        vy = bboxes[..., 8:9]
+        normalized_bboxes = torch.cat(
+            (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1)
+    else:
+        normalized_bboxes = torch.cat(
+            (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1)
+    return normalized_bboxes
+
+
+def denormalize_bbox(normalized_bboxes, pc_range):
+    """Denormalize bboxes.
+    Args:
+        normalized_bboxes (Tensor): boxes with coordinates \
+            (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y), as produced by \
+            `normalize_bbox`. Shape [num_query, 10].
+        pc_range (List): Perception range of the detector (unused here).
+    Returns:
+        denormalized_bboxes (Tensor): boxes with unnormalized
+            coordinates (cx,cy,cz,l,w,h,φ,v_x,v_y). Shape [num_gt, 9].
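+
+    Example (illustrative sketch; `pc_range` is unused by both helpers,
+    so ``None`` can be passed):
+        >>> import torch
+        >>> box = torch.tensor([[1., 2., 0., 4.2, 1.9, 1.6, .3, .5, .1]])
+        >>> restored = denormalize_bbox(normalize_bbox(box, None), None)
+        >>> torch.allclose(restored, box)
+        True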
+ """ + # rotation + rot_sine = normalized_bboxes[..., 6:7] + + rot_cosine = normalized_bboxes[..., 7:8] + rot = torch.atan2(rot_sine, rot_cosine) + + # center in the bev + cx = normalized_bboxes[..., 0:1] + cy = normalized_bboxes[..., 1:2] + cz = normalized_bboxes[..., 4:5] + + # size, the meaning of w,l may alter in different version of mmdet3d + w = normalized_bboxes[..., 2:3] + l = normalized_bboxes[..., 3:4] + h = normalized_bboxes[..., 5:6] + + w = w.exp() + l = l.exp() + h = h.exp() + if normalized_bboxes.size(-1) > 8: + # velocity + vx = normalized_bboxes[:, 8:9] + vy = normalized_bboxes[:, 9:10] + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], + dim=-1) + else: + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) + return denormalized_bboxes diff --git a/projects/mmdet3d_plugin/models/utils/__init__.py b/projects/mmdet3d_plugin/models/utils/__init__.py index 797ac7f..54bdea5 100755 --- a/projects/mmdet3d_plugin/models/utils/__init__.py +++ b/projects/mmdet3d_plugin/models/utils/__init__.py @@ -1,6 +1,2 @@ -from .dgcnn_attn import DGCNNAttn -from .detr import Deformable3DDetrTransformerDecoder -from .detr3d_transformer import Detr3DTransformer, Detr3DTransformerDecoder, Detr3DCrossAtten - -__all__ = ['DGCNNAttn', 'Deformable3DDetrTransformerDecoder', - 'Detr3DTransformer', 'Detr3DTransformerDecoder', 'Detr3DCrossAtten'] +from .detr3d_transformer import (Detr3DCrossAtten, Detr3DTransformer, + Detr3DTransformerDecoder) diff --git a/projects/mmdet3d_plugin/models/utils/detr.py b/projects/mmdet3d_plugin/models/utils/detr.py deleted file mode 100755 index ff72d32..0000000 --- a/projects/mmdet3d_plugin/models/utils/detr.py +++ /dev/null @@ -1,106 +0,0 @@ -import torch -from mmcv.cnn.bricks.registry import TRANSFORMER_LAYER_SEQUENCE -from mmcv.cnn.bricks.transformer import TransformerLayerSequence - - -def inverse_sigmoid(x, eps=1e-5): - """Inverse function of sigmoid. - Args: - x (Tensor): The tensor to do the - inverse. - eps (float): EPS avoid numerical - overflow. Defaults 1e-5. - Returns: - Tensor: The x has passed the inverse - function of sigmoid, has same - shape with input. - """ - x = x.clamp(min=0, max=1) - x1 = x.clamp(min=eps) - x2 = (1 - x).clamp(min=eps) - return torch.log(x1 / x2) - - -@TRANSFORMER_LAYER_SEQUENCE.register_module() -class Deformable3DDetrTransformerDecoder(TransformerLayerSequence): - """Copy the decoder in DETR transformer. - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - - def __init__(self, *args, return_intermediate=False, **kwargs): - super(Deformable3DDetrTransformerDecoder, self).__init__(*args, **kwargs) - self.return_intermediate = return_intermediate - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - reg_branches=None, - **kwargs): - """Forward function for `TransformerDecoder`. - Args: - query (Tensor): Input query with shape - `(num_query, bs, embed_dims)`. - reference_points (Tensor): The reference - points of offset. has shape - (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). - valid_ratios (Tensor): The radios of valid - points on the feature map, has shape - (bs, num_levels, 2) - reg_branch: (obj:`nn.ModuleList`): Used for - refining the regression results. Only would - be passed when with_box_refine is True, - otherwise would be passed a `None`. 
- Returns: - Tensor: Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims]. - """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: - reference_points_input = reference_points[:, :, None] * \ - torch.cat([valid_ratios, valid_ratios], -1)[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * \ - valid_ratios[:, None] - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) - output = output.permute(1, 0, 2) - - if reg_branches is not None: - tmp = reg_branches[lid](output) - if reference_points.shape[-1] == 4: - new_reference_points = tmp + inverse_sigmoid( - reference_points) - new_reference_points = new_reference_points.sigmoid() - else: - assert reference_points.shape[-1] == 2 - # This is to deal with the different output number (10). - # new_reference_points = tmp - new_reference_points = tmp[ - ..., :2] + inverse_sigmoid(reference_points) - new_reference_points = new_reference_points.sigmoid() - reference_points = new_reference_points.detach() - - output = output.permute(1, 0, 2) - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append(reference_points) - - if self.return_intermediate: - return torch.stack(intermediate), torch.stack( - intermediate_reference_points) - - return output, reference_points diff --git a/projects/mmdet3d_plugin/models/utils/detr3d_transformer.py b/projects/mmdet3d_plugin/models/utils/detr3d_transformer.py index 2a94e5c..b3cf281 100755 --- a/projects/mmdet3d_plugin/models/utils/detr3d_transformer.py +++ b/projects/mmdet3d_plugin/models/utils/detr3d_transformer.py @@ -1,21 +1,20 @@ +import time +import warnings import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn import xavier_init, constant_init -from mmcv.cnn.bricks.registry import (ATTENTION, - TRANSFORMER_LAYER_SEQUENCE) -from mmcv.cnn.bricks.transformer import (MultiScaleDeformableAttention, - TransformerLayerSequence, +from mmcv.cnn.bricks.transformer import (TransformerLayerSequence, build_transformer_layer_sequence) -from mmcv.runner.base_module import BaseModule - -from mmdet.models.utils.builder import TRANSFORMER +from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention +from mmdet3d.registry import MODELS +from mmengine.model import BaseModule, constant_init, xavier_init def inverse_sigmoid(x, eps=1e-5): """Inverse function of sigmoid. + Args: x (Tensor): The tensor to do the inverse. @@ -32,14 +31,17 @@ def inverse_sigmoid(x, eps=1e-5): return torch.log(x1 / x2) -@TRANSFORMER.register_module() +@MODELS.register_module() class Detr3DTransformer(BaseModule): """Implements the Detr3D transformer. + Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. + num_cams (int): Number of cameras in the dataset. + Default: 6 in NuScenes Det. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. 
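+
+    Note:
+        ``forward`` expects `query_embed` to be the concatenation of a
+        positional and a semantic embedding along the channel dimension
+        (shape [num_query, 2 * embed_dims]); it is split back into the
+        two halves via ``torch.split`` inside ``forward``.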
""" @@ -68,29 +70,24 @@ def init_weights(self): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): - if isinstance(m, MultiScaleDeformableAttention) or isinstance(m, Detr3DCrossAtten): + if isinstance(m, MultiScaleDeformableAttention) or isinstance( + m, Detr3DCrossAtten): m.init_weight() xavier_init(self.reference_points, distribution='uniform', bias=0.) - def forward(self, - mlvl_feats, - query_embed, - reg_branches=None, - **kwargs): + def forward(self, mlvl_feats, query_embed, reg_branches=None, **kwargs): """Forward function for `Detr3DTransformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape - [bs, embed_dims, h, w]. - query_embed (Tensor): The query embedding for decoder, - with shape [num_query, c]. + (B, N, C, H_lvl, W_lvl). + query_embed (Tensor): The query positional and semantic embedding for decoder, with shape [num_query, c+c]. mlvl_pos_embeds (list(Tensor)): The positional encoding - of feats from different level, has the shape - [bs, embed_dims, h, w]. + of feats from different level, has the shape [bs, N, embed_dims, h, w]. + It is unused here. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would - be passed when - `with_box_refine` is True. Default to None. + be passed when `with_box_refine` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If @@ -101,24 +98,13 @@ def forward(self, points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ - (num_dec_layers, bs,num_query, embed_dims) - - enc_outputs_class: The classification score of \ - proposals generated from \ - encoder's feature maps, has shape \ - (batch, h*w, num_classes). \ - Only would be returned when `as_two_stage` is True, \ - otherwise None. - - enc_outputs_coord_unact: The regression results \ - generated from encoder's feature maps., has shape \ - (batch, h*w, 4). Only would \ - be returned when `as_two_stage` is True, \ - otherwise None. + (num_dec_layers, bs, num_query, embed_dims) """ assert query_embed is not None bs = mlvl_feats[0].size(0) - query_pos, query = torch.split(query_embed, self.embed_dims , dim=1) - query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) - query = query.unsqueeze(0).expand(bs, -1, -1) + query_pos, query = torch.split(query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) #[bs,num_q,c] + query = query.unsqueeze(0).expand(bs, -1, -1) #[bs,num_q,c] reference_points = self.reference_points(query_pos) reference_points = reference_points.sigmoid() init_reference_out = reference_points @@ -139,9 +125,10 @@ def forward(self, return inter_states, init_reference_out, inter_references_out -@TRANSFORMER_LAYER_SEQUENCE.register_module() +@MODELS.register_module() class Detr3DTransformerDecoder(TransformerLayerSequence): """Implements the decoder in DETR3D transformer. + Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: @@ -165,7 +152,7 @@ def forward(self, reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). + otherwise has shape self.reference_points = nn.Linear(self.embed_dims, 3) reg_branch: (obj:`nn.ModuleList`): Used for refining the regression results. 
Only would be passed when with_box_refine is True, @@ -178,26 +165,24 @@ def forward(self, output = query intermediate = [] intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): + for lid, layer in enumerate(self.layers): ## iterative refinement reference_points_input = reference_points - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) + output = layer(output, + *args, + reference_points=reference_points_input, + **kwargs) output = output.permute(1, 0, 2) - if reg_branches is not None: tmp = reg_branches[lid](output) - + assert reference_points.shape[-1] == 3 new_reference_points = torch.zeros_like(reference_points) - new_reference_points[..., :2] = tmp[ - ..., :2] + inverse_sigmoid(reference_points[..., :2]) - new_reference_points[..., 2:3] = tmp[ - ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) - + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid( + reference_points[..., :2]) + new_reference_points[..., + 2:3] = tmp[..., 4:5] + inverse_sigmoid( + reference_points[..., 2:3]) new_reference_points = new_reference_points.sigmoid() reference_points = new_reference_points.detach() @@ -214,9 +199,10 @@ def forward(self, return output, reference_points -@ATTENTION.register_module() +@MODELS.register_module() class Detr3DCrossAtten(BaseModule): - """An attention module used in Detr3d. + """An attention module used in Detr3d. + Args: embed_dims (int): The embedding dimension of Attention. Default: 256. @@ -233,18 +219,21 @@ class Detr3DCrossAtten(BaseModule): Default: None. """ - def __init__(self, - embed_dims=256, - num_heads=8, - num_levels=4, - num_points=5, - num_cams=6, - im2col_step=64, - pc_range=None, - dropout=0.1, - norm_cfg=None, - init_cfg=None, - batch_first=False): + def __init__( + self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=5, + num_cams=6, + im2col_step=64, + pc_range=None, + dropout=0.1, + norm_cfg=None, + init_cfg=None, + batch_first=False, + # waymo_with_nuscene=False + ): super(Detr3DCrossAtten, self).__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' @@ -278,20 +267,20 @@ def _is_power_of_2(n): self.num_points = num_points self.num_cams = num_cams self.attention_weights = nn.Linear(embed_dims, - num_cams*num_levels*num_points) + num_cams * num_levels * num_points) self.output_proj = nn.Linear(embed_dims, embed_dims) - + self.position_encoder = nn.Sequential( - nn.Linear(3, self.embed_dims), + nn.Linear(3, self.embed_dims), nn.LayerNorm(self.embed_dims), nn.ReLU(inplace=True), - nn.Linear(self.embed_dims, self.embed_dims), + nn.Linear(self.embed_dims, self.embed_dims), nn.LayerNorm(self.embed_dims), nn.ReLU(inplace=True), ) self.batch_first = batch_first - + # self.waymo_with_nuscene = waymo_with_nuscene self.init_weight() def init_weight(self): @@ -305,44 +294,27 @@ def forward(self, value, residual=None, query_pos=None, - key_padding_mask=None, reference_points=None, - spatial_shapes=None, - level_start_index=None, **kwargs): """Forward Function of Detr3DCrossAtten. + Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. - value (Tensor): The value tensor with shape - `(num_key, bs, embed_dims)`. (B, N, C, H, W) + value (List[Tensor]): Image features from + different level. Each element has shape + (B, N, C, H_lvl, W_lvl). residual (Tensor): The tensor used for addition, with the same shape as `x`. 
Default None. If None, `x` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. - key_pos (Tensor): The positional encoding for `key`. Default - None. - reference_points (Tensor): The normalized reference - points with shape (bs, num_query, 4), - all elements is range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area. - or (N, Length_{query}, num_levels, 4), add - additional two dimensions is (w, h) to - form reference boxes. - key_padding_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. - spatial_shapes (Tensor): Spatial shape of features in - different level. With shape (num_levels, 2), - last dimension represent (h, w). - level_start_index (Tensor): The start index of each level. - A tensor has shape (num_levels) and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + reference_points (Tensor): The normalized 3D reference + points with shape (bs, num_query, 3) Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ - if key is None: key = query if value is None: @@ -353,70 +325,119 @@ def forward(self, if query_pos is not None: query = query + query_pos - # change to (bs, num_query, embed_dims) query = query.permute(1, 0, 2) bs, num_query, _ = query.size() attention_weights = self.attention_weights(query).view( bs, 1, num_query, self.num_cams, self.num_points, self.num_levels) - reference_points_3d, output, mask = feature_sampling( value, reference_points, self.pc_range, kwargs['img_metas']) output = torch.nan_to_num(output) mask = torch.nan_to_num(mask) - + # if self.waymo_with_nuscene == True: + # num_view = mask.shape[3] + # attention_weights = attention_weights[:, :, :, :num_view, ...] attention_weights = attention_weights.sigmoid() * mask output = output * attention_weights output = output.sum(-1).sum(-1).sum(-1) output = output.permute(2, 0, 1) - - output = self.output_proj(output) # (num_query, bs, embed_dims) - pos_feat = self.position_encoder(inverse_sigmoid(reference_points_3d)).permute(1, 0, 2) - + output = self.output_proj(output) + pos_feat = self.position_encoder( + inverse_sigmoid(reference_points_3d)).permute(1, 0, 2) return self.dropout(output) + inp_residual + pos_feat -def feature_sampling(mlvl_feats, reference_points, pc_range, img_metas): - lidar2img = [] - for img_meta in img_metas: - lidar2img.append(img_meta['lidar2img']) +def feature_sampling( + mlvl_feats, + ref_pt, + pc_range, + img_metas, + # return_depth=False +): + """ sample multi-level features by projecting 3D reference points to 2D image + Args: + mlvl_feats (List[Tensor]): Image features from + different level. Each element has shape + (B, N, C, H_lvl, W_lvl). + ref_pt (Tensor): The normalized 3D reference + points with shape (bs, num_query, 3) + pc_range: perception range of the detector + img_metas (list[dict]): Meta information of multiple inputs + in a batch, containing `lidar2img`. 
+    Returns:
+        ref_pt_3d (Tensor): A copy of the original ref_pt
+        sampled_feats (Tensor): sampled features with shape \
+            (B C num_q N 1 fpn_lvl)
+        mask (Tensor): Indicates whether each reference point \
+            is projected outside of the images, with shape \
+            (B 1 num_q N 1 1)
+    """
+    lidar2img = [meta['lidar2img'] for meta in img_metas]
     lidar2img = np.asarray(lidar2img)
-    lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4)
-    reference_points = reference_points.clone()
-    reference_points_3d = reference_points.clone()
-    reference_points[..., 0:1] = reference_points[..., 0:1]*(pc_range[3] - pc_range[0]) + pc_range[0]
-    reference_points[..., 1:2] = reference_points[..., 1:2]*(pc_range[4] - pc_range[1]) + pc_range[1]
-    reference_points[..., 2:3] = reference_points[..., 2:3]*(pc_range[5] - pc_range[2]) + pc_range[2]
-    # reference_points (B, num_queries, 4)
-    reference_points = torch.cat((reference_points, torch.ones_like(reference_points[..., :1])), -1)
-    B, num_query = reference_points.size()[:2]
+    lidar2img = ref_pt.new_tensor(lidar2img)
+    ref_pt = ref_pt.clone()
+    ref_pt_3d = ref_pt.clone()
+
+    B, num_query = ref_pt.size()[:2]
     num_cam = lidar2img.size(1)
-    reference_points = reference_points.view(B, 1, num_query, 4).repeat(1, num_cam, 1, 1).unsqueeze(-1)
-    lidar2img = lidar2img.view(B, num_cam, 1, 4, 4).repeat(1, 1, num_query, 1, 1)
-    reference_points_cam = torch.matmul(lidar2img, reference_points).squeeze(-1)
     eps = 1e-5
-    mask = (reference_points_cam[..., 2:3] > eps)
-    reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum(
-        reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3])*eps)
-    reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1]
-    reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0]
-    reference_points_cam = (reference_points_cam - 0.5) * 2
-    mask = (mask & (reference_points_cam[..., 0:1] > -1.0)
-            & (reference_points_cam[..., 0:1] < 1.0)
-            & (reference_points_cam[..., 1:2] > -1.0)
-            & (reference_points_cam[..., 1:2] < 1.0))
+
+    ref_pt[..., 0:1] = \
+        ref_pt[..., 0:1] * (pc_range[3] - pc_range[0]) + pc_range[0]  # x
+    ref_pt[..., 1:2] = \
+        ref_pt[..., 1:2] * (pc_range[4] - pc_range[1]) + pc_range[1]  # y
+    ref_pt[..., 2:3] = \
+        ref_pt[..., 2:3] * (pc_range[5] - pc_range[2]) + pc_range[2]  # z
+
+    # (B num_q 3) -> (B num_q 4) -> (B 1 num_q 4) -> (B num_cam num_q 4 1)
+    ref_pt = torch.cat((ref_pt, torch.ones_like(ref_pt[..., :1])), -1)
+    ref_pt = ref_pt.view(B, 1, num_query, 4)
+    ref_pt = ref_pt.repeat(1, num_cam, 1, 1).unsqueeze(-1)
+    # (B num_cam 4 4) -> (B num_cam num_q 4 4)
+    lidar2img = lidar2img.view(B, num_cam, 1, 4, 4)\
+        .repeat(1, 1, num_query, 1, 1)
+    # (... 4 4) * (...
4 1) -> (B num_cam num_q 4) + pt_cam = torch.matmul(lidar2img, ref_pt).squeeze(-1) + + # (B num_cam num_q) + z = pt_cam[..., 2:3] + eps = eps * torch.ones_like(z) + mask = (z > eps) + pt_cam = pt_cam[..., 0:2] / torch.maximum(z, eps) # prevent zero-division + #padded nuscene image: 928*1600 + (h, w) = img_metas[0]['pad_shape'] + pt_cam[..., 0] /= w + pt_cam[..., 1] /= h + # else: + # (h,w,_) = img_metas[0]['ori_shape'][0] #waymo image + # pt_cam[..., 0] /= w #cam0~2: 1280*1920 + # pt_cam[..., 1] /= h #cam3~4: 886 *1920 padded to 1280*1920 + # mask[:, 3:5, :] &= (pt_cam[:, 3:5, :, 1:2] < 0.7) #filter pt_cam_y > 886 + + mask = (mask & (pt_cam[..., 0:1] > 0.0) + & (pt_cam[..., 0:1] < 1.0) + & (pt_cam[..., 1:2] > 0.0) + & (pt_cam[..., 1:2] < 1.0)) + #(B num_cam num_q) -> (B 1 num_q num_cam 1 1) mask = mask.view(B, num_cam, 1, num_query, 1, 1).permute(0, 2, 3, 1, 4, 5) mask = torch.nan_to_num(mask) + + pt_cam = (pt_cam - 0.5) * 2 #[0,1] to [-1,1] to do grid_sample sampled_feats = [] for lvl, feat in enumerate(mlvl_feats): B, N, C, H, W = feat.size() - feat = feat.view(B*N, C, H, W) - reference_points_cam_lvl = reference_points_cam.view(B*N, num_query, 1, 2) - sampled_feat = F.grid_sample(feat, reference_points_cam_lvl) - sampled_feat = sampled_feat.view(B, N, C, num_query, 1).permute(0, 2, 3, 1, 4) + feat = feat.view(B * N, C, H, W) + pt_cam_lvl = pt_cam.view(B * N, num_query, 1, 2) + sampled_feat = F.grid_sample(feat, pt_cam_lvl) + #(B num_cam C num_query 1) -> List of (B C num_q num_cam 1) + sampled_feat = sampled_feat.view(B, N, C, num_query, 1) + sampled_feat = sampled_feat.permute(0, 2, 3, 1, 4) sampled_feats.append(sampled_feat) + sampled_feats = torch.stack(sampled_feats, -1) - sampled_feats = sampled_feats.view(B, C, num_query, num_cam, 1, len(mlvl_feats)) - return reference_points_3d, sampled_feats, mask + # (B C num_q num_cam fpn_lvl) + sampled_feats = \ + sampled_feats.view(B, C, num_query, num_cam, 1, len(mlvl_feats)) + return ref_pt_3d, sampled_feats, mask diff --git a/projects/mmdet3d_plugin/models/utils/dgcnn_attn.py b/projects/mmdet3d_plugin/models/utils/dgcnn_attn.py deleted file mode 100755 index 5cfd459..0000000 --- a/projects/mmdet3d_plugin/models/utils/dgcnn_attn.py +++ /dev/null @@ -1,96 +0,0 @@ -import math - -import torch -import torch.nn as nn -from mmcv.cnn.bricks.registry import ATTENTION -from mmcv.runner.base_module import BaseModule - - -@ATTENTION.register_module() -class DGCNNAttn(BaseModule): - """A warpper for DGCNN-type self-attention. - Args: - embed_dims (int): The embedding dimension. - num_heads (int): Parallel attention heads. Same as - `nn.MultiheadAttention`. - dropout (float):w A Dropout layer on attn_output_weights. Default: 0.. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims, - num_heads, - dropout=0., - init_cfg=None, - **kwargs): - super(DGCNNAttn, self).__init__(init_cfg) - self.embed_dims = embed_dims - self.num_heads = num_heads - self.dropout = dropout - self.conv1 = nn.Sequential(nn.Conv2d(self.embed_dims*2, self.embed_dims, kernel_size=1, bias=False), - nn.BatchNorm2d(self.embed_dims), - nn.ReLU(inplace=True)) - self.conv2 = nn.Sequential(nn.Conv2d(self.embed_dims*2, self.embed_dims, kernel_size=1, bias=False), - nn.BatchNorm2d(self.embed_dims), - nn.ReLU(inplace=True)) - self.K = kwargs['K'] - self.dropout = nn.Dropout(dropout) - - def forward(self, - query, - key=None, - value=None, - residual=None, - query_pos=None, - key_pos=None, - attn_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `DGCNN`. - **kwargs allow passing a more general data flow when combining - with other operations in `DGCNN`. - Args: - query (Tensor): The input query with shape [num_queries, bs, - embed_dims]. Same in `nn.MultiheadAttention.forward`. - residual (Tensor): This tensor, with the same shape as x, - will be used for the residual link. - If None, `x` will be used. Defaults to None. - query_pos (Tensor): The positional encoding for query, with - the same shape as `x`. If not None, it will - be added to `x` before forward function. Defaults to None. - Returns: - Tensor: forwarded results with shape [num_queries, bs, embed_dims]. - """ - if residual is None: - residual = query - if query_pos is not None: - query = query + query_pos - - query = query.permute(1, 0, 2) # [bs, num_queries, embed_dims] - edge_feats = self.edge_feats(query, K=self.K) - edge_feats1 = self.conv1(edge_feats) - edge_feats1 = edge_feats1.max(dim=-1)[0] - out = edge_feats1 - edge_feats1 = self.edge_feats(edge_feats1.permute(0, 2, 1)) - edge_feats2 = self.conv2(edge_feats1) - edge_feats2 = edge_feats2.max(dim=-1)[0] - out = out + edge_feats2 - out = out.permute(2, 0, 1) - return residual + self.dropout(out) - - def edge_feats(self, query, K=16): - # (B, N, N) - affinity = torch.cdist(query, query) - # (B, N, K) - _, topk = torch.topk(affinity, k=K, dim=2) - B, N, C = query.size() - - idx_base = torch.arange(0, B, device=query.device).view(-1, 1, 1) * N - idx = topk + idx_base - idx = idx.view(-1) - query = query.reshape(B*N, C) - query_neighbor = query[idx, :].view(B, N, K, C) - query = query.reshape(B, N, 1, C).repeat(1, 1, K, 1) - out = torch.cat((query_neighbor, query), dim=-1).permute(0, 3, 1, 2).contiguous() - return out diff --git a/projects/mmdet3d_plugin/models/utils/grid_mask.py b/projects/mmdet3d_plugin/models/utils/grid_mask.py index fe05ca8..f57bb50 100755 --- a/projects/mmdet3d_plugin/models/utils/grid_mask.py +++ b/projects/mmdet3d_plugin/models/utils/grid_mask.py @@ -1,16 +1,25 @@ +import numpy as np import torch import torch.nn as nn -import numpy as np from PIL import Image + class Grid(object): - def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + + def __init__(self, + use_h, + use_w, + rotate=1, + offset=False, + ratio=0.5, + mode=0, + prob=1.): self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio - self.mode=mode + self.mode = mode self.st_prob = prob self.prob = prob @@ -24,50 +33,59 @@ def __call__(self, img, label): w = img.size(2) self.d1 = 2 self.d2 = min(h, w) - hh = int(1.5*h) - ww = int(1.5*w) + hh = int(1.5 * h) + ww = int(1.5 * w) d = np.random.randint(self.d1, self.d2) if self.ratio == 1: self.l = np.random.randint(1, 
d) else: - self.l = min(max(int(d*self.ratio+0.5),1),d-1) + self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: - for i in range(hh//d): - s = d*i + st_h - t = min(s+self.l, hh) - mask[s:t,:] *= 0 + for i in range(hh // d): + s = d * i + st_h + t = min(s + self.l, hh) + mask[s:t, :] *= 0 if self.use_w: - for i in range(ww//d): - s = d*i + st_w - t = min(s+self.l, ww) - mask[:,s:t] *= 0 - + for i in range(ww // d): + s = d * i + st_w + t = min(s + self.l, ww) + mask[:, s:t] *= 0 + r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) - mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, + (ww - w) // 2:(ww - w) // 2 + w] mask = torch.from_numpy(mask).float() if self.mode == 1: - mask = 1-mask + mask = 1 - mask mask = mask.expand_as(img) if self.offset: - offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() + offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float() offset = (1 - mask) * offset img = img * mask + offset else: - img = img * mask + img = img * mask return img, label class GridMask(nn.Module): - def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + + def __init__(self, + use_h, + use_w, + rotate=1, + offset=False, + ratio=0.5, + mode=0, + prob=1.): super(GridMask, self).__init__() self.use_h = use_h self.use_w = use_w @@ -79,45 +97,47 @@ def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, self.prob = prob def set_prob(self, epoch, max_epoch): - self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 + self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 def forward(self, x): if np.random.rand() > self.prob or not self.training: return x - n,c,h,w = x.size() - x = x.view(-1,h,w) - hh = int(1.5*h) - ww = int(1.5*w) + n, c, h, w = x.size() + x = x.view(-1, h, w) + hh = int(1.5 * h) + ww = int(1.5 * w) d = np.random.randint(2, h) - self.l = min(max(int(d*self.ratio+0.5),1),d-1) + self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: - for i in range(hh//d): - s = d*i + st_h - t = min(s+self.l, hh) - mask[s:t,:] *= 0 + for i in range(hh // d): + s = d * i + st_h + t = min(s + self.l, hh) + mask[s:t, :] *= 0 if self.use_w: - for i in range(ww//d): - s = d*i + st_w - t = min(s+self.l, ww) - mask[:,s:t] *= 0 - + for i in range(ww // d): + s = d * i + st_w + t = min(s + self.l, ww) + mask[:, s:t] *= 0 + r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) - mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, + (ww - w) // 2:(ww - w) // 2 + w] mask = torch.from_numpy(mask).float().cuda() if self.mode == 1: - mask = 1-mask + mask = 1 - mask mask = mask.expand_as(x) if self.offset: - offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float().cuda() + offset = torch.from_numpy( + 2 * (np.random.rand(h, w) - 0.5)).float().cuda() x = x * mask + offset * (1 - mask) else: - x = x * mask + x = x * mask - return x.view(n,c,h,w) \ No newline at end of file + return x.view(n, c, h, w) From 470d9a6aa83765635c3b259a12495c6ed63c3536 Mon Sep 17 00:00:00 2001 From: zltjohn <1163842861@qq.com> Date: Thu, 22 Dec 2022 11:12:15 +0800 Subject: [PATCH 2/2] 
correct 'config' to 'configs' --- .../detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py | 0 projects/{config => configs}/detr3d_res101_gridmask_dev-1.x.py | 0 .../detr3d_res101_gridmask_dev-1.x_test_old.py | 0 ...3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename projects/{config => configs}/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py (100%) rename projects/{config => configs}/detr3d_res101_gridmask_dev-1.x.py (100%) rename projects/{config => configs}/detr3d_res101_gridmask_dev-1.x_test_old.py (100%) rename projects/{config => configs}/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py (100%) diff --git a/projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py b/projects/configs/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py similarity index 100% rename from projects/config/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py rename to projects/configs/detr3d_res101_gridmask_cbgs_dev-1.x_test_old.py diff --git a/projects/config/detr3d_res101_gridmask_dev-1.x.py b/projects/configs/detr3d_res101_gridmask_dev-1.x.py similarity index 100% rename from projects/config/detr3d_res101_gridmask_dev-1.x.py rename to projects/configs/detr3d_res101_gridmask_dev-1.x.py diff --git a/projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py b/projects/configs/detr3d_res101_gridmask_dev-1.x_test_old.py similarity index 100% rename from projects/config/detr3d_res101_gridmask_dev-1.x_test_old.py rename to projects/configs/detr3d_res101_gridmask_dev-1.x_test_old.py diff --git a/projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py b/projects/configs/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py similarity index 100% rename from projects/config/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py rename to projects/configs/detr3d_vovnet_gridmask_det_final_trainval_cbgs_dev-1.x_test_old.py
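
Post-script (illustrative sketch, not part of either patch): `Detr3D_old.predict` above converts dev-1.x boxes back to the pre-1.0 convention by swapping the (w, l) slots and remapping the yaw. A minimal stand-alone check of that remapping on plain tensors in the (cx, cy, cz, w, l, h, yaw, vx, vy) layout; the helper name `old_to_new_boxes` is ours, not from the patch:

import numpy as np
import torch


def old_to_new_boxes(boxes: torch.Tensor) -> torch.Tensor:
    # mirrors Detr3D_old.predict: swap the two size slots, then remap yaw
    boxes = boxes.clone()
    boxes[..., [3, 4]] = boxes[..., [4, 3]]
    boxes[..., 6] = -boxes[..., 6] - np.pi / 2
    return boxes


# the yaw remap is an involution, -(-r - pi/2) - pi/2 == r, and the slot
# swap undoes itself, so applying the conversion twice restores the input
b = torch.tensor([[0., 0., 0., 1.9, 4.2, 1.6, 0.3, 0., 0.]])
assert torch.allclose(old_to_new_boxes(old_to_new_boxes(b)), b)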