diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/base.py b/autoware_ml/configs/detection3d/dataset/t4dataset/base.py
index 3be587072..7f4be6293 100644
--- a/autoware_ml/configs/detection3d/dataset/t4dataset/base.py
+++ b/autoware_ml/configs/detection3d/dataset/t4dataset/base.py
@@ -29,6 +29,9 @@
     "db_j6gen2_v7",
     "db_j6gen2_v8",
     "db_j6gen2_v9",
+    "db_j6gen2_v10",
+    "db_j6gen2_v11",
+    "db_j6gen2_v12",
     "db_largebus_v1",
     "db_largebus_v2",
     "db_largebus_v3",
@@ -146,6 +149,8 @@
     "traffic_cone": "traffic_cone",
     "trafficcone": "traffic_cone",
     "barrier": "barrier",
+    "other_vehicle": "car",
+    "other_pedestrian": "pedestrian",
 }
 
 class_names = ["car", "truck", "bus", "bicycle", "pedestrian", "traffic_cone", "barrier"]
diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2.py b/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2.py
index 0324e7207..a87166019 100644
--- a/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2.py
+++ b/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2.py
@@ -27,6 +27,9 @@
     "db_j6gen2_v7",
     "db_j6gen2_v8",
     "db_j6gen2_v9",
+    "db_j6gen2_v10",
+    "db_j6gen2_v11",
+    "db_j6gen2_v12",
 ]
 
 dataset_test_groups = {
@@ -127,6 +130,8 @@
     "traffic_cone": "traffic_cone",
     "trafficcone": "traffic_cone",
     "barrier": "barrier",
+    "other_vehicle": "car",
+    "other_pedestrian": "pedestrian",
 }
 
 class_names = [
diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py b/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py
index b9ec03f27..ef0141a5b 100644
--- a/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py
+++ b/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py
@@ -27,6 +27,9 @@
     "db_j6gen2_v7",
     "db_j6gen2_v8",
     "db_j6gen2_v9",
+    "db_j6gen2_v10",
+    "db_j6gen2_v11",
+    "db_j6gen2_v12",
     "db_largebus_v1",
     "db_largebus_v2",
     "db_largebus_v3",
@@ -133,6 +136,8 @@
     "traffic_cone": "traffic_cone",
     "trafficcone": "traffic_cone",
     "barrier": "barrier",
+    "other_vehicle": "car",
+    "other_pedestrian": "pedestrian",
 }
 
 
diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py b/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py
index c08decfa1..0f00a651d 100644
--- a/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py
+++ b/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py
@@ -123,6 +123,8 @@
     "traffic_cone": "traffic_cone",
     "trafficcone": "traffic_cone",
     "barrier": "barrier",
+    "other_vehicle": "car",
+    "other_pedestrian": "pedestrian",
 }
 
 class_names = [
diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_gen2.py b/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_gen2.py
index dbd6e2813..9995cd9b7 100644
--- a/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_gen2.py
+++ b/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_gen2.py
@@ -120,6 +120,8 @@
     "traffic_cone": "traffic_cone",
     "trafficcone": "traffic_cone",
     "barrier": "barrier",
+    "other_vehicle": "car",
+    "other_pedestrian": "pedestrian",
 }
 
 class_names = [
diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/largebus.py b/autoware_ml/configs/detection3d/dataset/t4dataset/largebus.py
index 2212b8e56..cd42362b5 100644
--- a/autoware_ml/configs/detection3d/dataset/t4dataset/largebus.py
+++ b/autoware_ml/configs/detection3d/dataset/t4dataset/largebus.py
@@ -122,6 +122,8 @@
     "traffic_cone": "traffic_cone",
     "trafficcone": "traffic_cone",
     "barrier": "barrier",
+    "other_vehicle": "car",
+    "other_pedestrian": "pedestrian",
 }
 
 class_names = [
diff --git a/autoware_ml/configs/detection3d/default_runtime.py b/autoware_ml/configs/detection3d/default_runtime.py
index cc2b896f7..6da761425 100644
--- a/autoware_ml/configs/detection3d/default_runtime.py
+++ b/autoware_ml/configs/detection3d/default_runtime.py
@@ -2,9 +2,17 @@
 
 default_hooks = dict(
     timer=dict(type="IterTimerHook"),
-    logger=dict(type="LoggerHook", interval=50),
+    logger=dict(
+        type="LoggerHook",
+        interval=50,
+        backend_args=dict(backend="local"),
+    ),
     param_scheduler=dict(type="ParamSchedulerHook"),
-    checkpoint=dict(type="CheckpointHook", interval=-1),
+    checkpoint=dict(
+        type="CheckpointHook",
+        interval=-1,
+        backend_args=dict(backend="local"),
+    ),
     sampler_seed=dict(type="DistSamplerSeedHook"),
     visualization=dict(type="Det3DVisualizationHook"),
 )
diff --git a/autoware_ml/detection3d/datasets/t4dataset.py b/autoware_ml/detection3d/datasets/t4dataset.py
index ce1c78f31..38c5f69dd 100644
--- a/autoware_ml/detection3d/datasets/t4dataset.py
+++ b/autoware_ml/detection3d/datasets/t4dataset.py
@@ -2,6 +2,7 @@
 from typing import List
 
 import numpy as np
+import tqdm
 from mmdet3d.datasets import NuScenesDataset
 from mmengine.logging import print_log
 from mmengine.registry import DATASETS
@@ -56,16 +57,14 @@ def filter_data(self) -> List[dict]:
             return self.data_list
 
         filtered_data_list = []
-        for entry in self.data_list:
+        for entry in tqdm.tqdm(self.data_list, desc="Filtering data"):
             filtered = False
             for camera_order in filter_frames_with_camera_order:
                 if camera_order not in entry["images"]:
                     filtered = True
                     break
 
-                if entry["images"][camera_order]["img_path"] is None or not osp.exists(
-                    entry["images"][camera_order]["img_path"]
-                ):
+                if entry["images"][camera_order]["img_path"] is None:
                     filtered = True
                     break
 
@@ -180,6 +179,7 @@ def parse_data_info(self, info: dict) -> dict:
                             cam_prefix,
                             img_info["img_path"],
                         )
+                    # print_log(f"Camera path: {img_info['img_path']}", logger="current")
 
             if self.default_cam_key is not None:
                 info["img_path"] = info["images"][self.default_cam_key]["img_path"]
@@ -192,4 +192,7 @@ def parse_data_info(self, info: dict) -> dict:
                 else:
                     info["lidar2img"] = info["cam2img"] @ info["lidar2cam"]
 
+        # Default difficulty to 0 if not present
+        if "difficulty" not in info:
+            info["difficulty"] = 0
         return info
diff --git a/autoware_ml/detection3d/datasets/transforms/__init__.py b/autoware_ml/detection3d/datasets/transforms/__init__.py
index 6bc932f1a..a63ff1eea 100644
--- a/autoware_ml/detection3d/datasets/transforms/__init__.py
+++ b/autoware_ml/detection3d/datasets/transforms/__init__.py
@@ -1,3 +1,4 @@
+from .local_3d_bbox import Local3DBBoxExpand
 from .object_min_points_filter import ObjectMinPointsFilter
 
-__all__ = ["ObjectMinPointsFilter"]
+__all__ = ["ObjectMinPointsFilter", "Local3DBBoxExpand"]
diff --git a/autoware_ml/detection3d/datasets/transforms/local_3d_bbox.py b/autoware_ml/detection3d/datasets/transforms/local_3d_bbox.py
new file mode 100644
index 000000000..ae06d4005
--- /dev/null
+++ b/autoware_ml/detection3d/datasets/transforms/local_3d_bbox.py
@@ -0,0 +1,58 @@
+from typing import List
+
+import numpy as np
+from mmcv.transforms import BaseTransform
+from mmdet3d.structures.ops import box_np_ops
+from mmengine.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class Local3DBBoxExpand(BaseTransform):
+    """Locally expand the 3D bounding boxes by scaling the width, which it doesn't scale the points.
+
+    Args:
+        expand_widths: (List[float]): Uniformly sampled expand width.
+        width_dim: (int): The dimension of the width. Default is 4, which is the width dimension of the 3D
+                  bounding box. Since 3D Bbox is in the format of [x, y, z, dx, dy, dz, heading], the width dimension is the
+                    4th dimension.
+            label_ids: (List[int]): The label IDs to expand. If None, all label IDs will be expanded.
+    """
+
+    def __init__(self, expand_widths: List[float], width_dim: int = 4, label_ids: List[int] = None) -> None:
+        assert isinstance(expand_widths, list)
+        assert len(expand_widths) == 2
+        assert expand_widths[0] < expand_widths[1]
+        self.expand_widths = expand_widths
+        self.width_dim = width_dim
+        self.label_ids = label_ids
+
+    def transform(self, input_dict: dict) -> dict:
+        """Call function to locally augment the 3D bounding boxes by scaling the width.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after locally augmenting the 3D bounding boxes by scaling the width, 'gt_bboxes_3d' \
+                key is updated in the result dict.
+        """
+        # Label mask
+        if self.label_ids is not None:
+            label_masks = [True if label in self.label_ids else False for label in input_dict["gt_labels_3d"]]
+        else:
+            label_masks = np.ones(len(input_dict["gt_labels_3d"]), dtype=bool)
+
+        for i in range(len(input_dict["gt_bboxes_3d"])):
+            if not label_masks[i]:
+                continue
+
+            expand_width = np.random.uniform(self.expand_widths[0], self.expand_widths[1])
+            input_dict["gt_bboxes_3d"].tensor[i, self.width_dim] += expand_width
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f"(expand_widths={self.expand_widths}, width_dim={self.width_dim}, label_ids={self.label_ids})"
+        return repr_str
diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py
index 60a64b532..c4afb6a86 100644
--- a/projects/BEVFusion/bevfusion/__init__.py
+++ b/projects/BEVFusion/bevfusion/__init__.py
@@ -1,11 +1,19 @@
 from .bevfusion import BEVFusion
 from .bevfusion_head import BEVFusionHead, ConvFuser
 from .bevfusion_necks import GeneralizedLSSFPN
+from .bevfusion_voxel_encoder import BEVFusionVoxelFeatureNet, HardSimpleVoxelSinCosEncoder
 from .depth_lss import DepthLSSTransform, LSSTransform
-from .loading import BEVLoadMultiViewImageFromFiles
+from .depth_lss_v2 import LSSTransformV2, LSSTransformV2DepthAware
+from .loading import BEVLoadMultiViewImageFromFiles, PointsToMultiViewImageDepths
 from .sparse_encoder import BEVFusionSparseEncoder
 from .transformer import TransformerDecoderLayer
-from .transforms_3d import BEVFusionGlobalRotScaleTrans, BEVFusionRandomFlip3D, GridMask, ImageAug3D
+from .transforms_3d import (
+    BEVFusionGlobalRotScaleTrans,
+    BEVFusionRandomFlip3D,
+    BEVFusionRemoveLiDARPoints,
+    GridMask,
+    ImageAug3D,
+)
 from .utils import BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D, IoU3DCost, TransFusionBBoxCoder
 
 __all__ = [
@@ -26,5 +34,11 @@
     "TransformerDecoderLayer",
     "BEVFusionRandomFlip3D",
     "BEVFusionGlobalRotScaleTrans",
+    "BEVFusionRemoveLiDARPoints",
     "TransFusionBBoxCoder",
+    "HardSimpleVoxelSinCosEncoder",
+    "BEVFusionVoxelFeatureNet",
+    "LSSTransformV2",
+    "PointsToMultiViewImageDepths",
+    "LSSTransformV2DepthAware",
 ]
diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py
index 243b3beb5..2eb3ef3d3 100644
--- a/projects/BEVFusion/bevfusion/bevfusion.py
+++ b/projects/BEVFusion/bevfusion/bevfusion.py
@@ -1,7 +1,10 @@
+import math
 from collections import OrderedDict
 from copy import deepcopy
+from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
+import matplotlib.pyplot as plt
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -9,6 +12,7 @@
 from mmdet3d.registry import MODELS
 from mmdet3d.structures import Det3DDataSample
 from mmdet3d.utils import OptConfigType, OptMultiConfig, OptSampleList
+from mmengine.logging import print_log
 from mmengine.utils import is_list_of
 from torch import Tensor
 from torch.nn import functional as F
@@ -34,6 +38,9 @@ def __init__(
         bbox_head: Optional[dict] = None,
         init_cfg: OptMultiConfig = None,
         seg_head: Optional[dict] = None,
+        loss_depth_weight: float = 3.0,
+        depth_gt_downsample: int = 1,
+        visualize_gt_depth_dir: Optional[str] = None,
         **kwargs,
     ) -> None:
         """Initialize BEVFusion model.
@@ -56,12 +63,10 @@ def __init__(
         super().__init__(data_preprocessor=data_preprocessor, init_cfg=init_cfg)
 
         if voxelize_cfg is not None:
-            self.voxelize_reduce = voxelize_cfg.pop("voxelize_reduce")
             self.pts_voxel_layer = Voxelization(**voxelize_cfg)
             self.pts_voxel_encoder = MODELS.build(pts_voxel_encoder)
             self.pts_middle_encoder = MODELS.build(pts_middle_encoder)
         else:
-            self.voxelize_reduce = False
             self.pts_voxel_layer = None
             self.pts_voxel_encoder = None
             self.pts_middle_encoder = None
@@ -76,8 +81,12 @@ def __init__(
         self.pts_neck = MODELS.build(pts_neck) if pts_neck is not None else None
 
         self.bbox_head = MODELS.build(bbox_head)
-
-        self.init_weights()
+        self._weights_initialized = False
+        self.loss_depth_weight = loss_depth_weight
+        self.depth_gt_downsample = depth_gt_downsample
+        self.visualize_gt_depth_dir = Path(visualize_gt_depth_dir) if visualize_gt_depth_dir is not None else None
+        if self.visualize_gt_depth_dir is not None:
+            self.visualize_gt_depth_dir.mkdir(parents=True, exist_ok=True)
 
     def _forward(
         self, batch_inputs_dict: Tensor, batch_data_samples: OptSampleList = [], using_image_features=False, **kwargs
@@ -133,8 +142,11 @@ def parse_losses(self, losses: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, D
         return loss, log_vars  # type: ignore
 
     def init_weights(self) -> None:
+        if self._weights_initialized:
+            return
         if self.img_backbone is not None:
             self.img_backbone.init_weights()
+        self._weights_initialized = True
 
     @property
     def with_bbox_head(self):
@@ -146,6 +158,53 @@ def with_seg_head(self):
         """bool: Whether the detector has a segmentation head."""
         return hasattr(self, "seg_head") and self.seg_head is not None
 
+    def prepare_camera_depth_aware_parameters(
+        self,
+        camera_intrinsics: torch.Tensor,
+        img_aug_matrix: torch.Tensor,
+        lidar_aug_matrix: torch.Tensor,
+        camera2lidar: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            camera_intrinsics: torch.Tensor, the camera intrinsics of shape (B, N, 3, 3).
+            img_aug_matrix: torch.Tensor, the image augmentation matrix of shape (B, N, 4, 4).
+            lidar_aug_matrix: torch.Tensor, the lidar augmentation matrix of shape (B, 4, 4).
+            camera2lidar: torch.Tensor, the camera to lidar matrix of shape (B, N, 4, 4).
+        Returns:
+            torch.Tensor, the camera depth aware parameters of shape (B*N, N_CAMERA_DEPTH_PARAMETERS).
+        """
+        B, N, _, _ = camera_intrinsics.shape
+        lidar_aug_matrix = lidar_aug_matrix.view(B, 1, 4, 4).repeat(1, N, 1, 1)
+
+        # (B*N, 15)
+        mlp_input = torch.stack(
+            [
+                camera_intrinsics[:, :, 0, 0],  # fx
+                camera_intrinsics[:, :, 1, 1],  # fy
+                camera_intrinsics[:, :, 0, 2],  # cx
+                camera_intrinsics[:, :, 1, 2],  # cy
+                img_aug_matrix[:, :, 0, 0],  # r11
+                img_aug_matrix[:, :, 0, 1],  # r12
+                img_aug_matrix[:, :, 0, 3],  # t1
+                img_aug_matrix[:, :, 1, 0],  # r21
+                img_aug_matrix[:, :, 1, 1],  # r22
+                img_aug_matrix[:, :, 1, 3],  # t2
+                lidar_aug_matrix[:, :, 0, 0],  # r11
+                lidar_aug_matrix[:, :, 0, 1],  # r12
+                lidar_aug_matrix[:, :, 1, 0],  # r21
+                lidar_aug_matrix[:, :, 1, 1],  # r22
+                lidar_aug_matrix[:, :, 2, 2],  # r33
+            ],
+            dim=-1,
+        )
+        # (B, N, 4, 4) -> (B, N, 3, 4) -> (B*N, 12)
+        camera2lidar_flatten = camera2lidar[:, :, :3, :].view(B, N, -1)
+
+        # (B, N, 15+12)
+        mlp_input = torch.cat([mlp_input, camera2lidar_flatten], dim=-1)
+        return mlp_input
+
     def get_image_backbone_features(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C, H, W = x.size()
         x = x.view(B * N, C, H, W).contiguous()
@@ -176,14 +235,15 @@ def extract_img_feat(
         lidar_aug_matrix_inverse=None,
         geom_feats=None,
         using_image_features=False,
-    ) -> torch.Tensor:
+        camera_depth_aware_parameters=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
 
         if not using_image_features:
             x = self.get_image_backbone_features(x)
 
-        with torch.cuda.amp.autocast(enabled=False):
+        with torch.amp.autocast("cuda", enabled=False):
             # with torch.autocast(device_type='cuda', dtype=torch.float32):
-            x = self.view_transform(
+            x, pred_depths = self.view_transform(
                 x,
                 points,
                 lidar2image,
@@ -196,29 +256,32 @@ def extract_img_feat(
                 img_aug_matrix_inverse,
                 lidar_aug_matrix_inverse,
                 geom_feats,
+                camera_depth_aware_parameters=camera_depth_aware_parameters,
             )
-        return x
+        return x, pred_depths
 
     def extract_pts_feat(self, feats, coords, sizes, points=None) -> torch.Tensor:
         if points is not None:
             # NOTE(knzo25): training and normal inference
-            with torch.cuda.amp.autocast(enabled=False):
+            with torch.amp.autocast("cuda", enabled=False):
                 # with torch.autocast('cuda', enabled=False):
                 points = [point.float() for point in points]
                 feats, coords, sizes = self.voxelize(points)
                 batch_size = coords[-1, 0] + 1
         else:
-            # NOTE(knzo25): onnx inference. Voxelization happens outside the graph
-            with torch.cuda.amp.autocast(enabled=False):
+            # NOTE: (knzo25): onnx inference. Voxelization happens outside the graph
+            with torch.amp.autocast("cuda", enabled=False):
+                # with torch.autocast('cuda', enabled=False):
+
+                # NOTE(knzo25): onnx demmands this
+                # batch_size = coords[-1, 0] + 1
                 # with torch.autocast('cuda', enabled=False):
 
                 # NOTE(knzo25): onnx demmands this
                 # batch_size = coords[-1, 0] + 1
                 batch_size = 1
                 print("Run onnx point_eSpConvst")
-                assert self.voxelize_reduce
-                if self.voxelize_reduce:
-                    feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(-1, 1)
+        feats = self.pts_voxel_encoder(feats, sizes, coords)
         x = self.pts_middle_encoder(feats, coords, batch_size)
         return x
 
@@ -241,11 +304,12 @@ def voxelize(self, points):
 
         feats = torch.cat(feats, dim=0)
         coords = torch.cat(coords, dim=0)
-        if len(sizes) > 0:
-            sizes = torch.cat(sizes, dim=0)
-            if self.voxelize_reduce:
-                feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(-1, 1)
-                feats = feats.contiguous()
+        assert len(sizes) > 0, "No points in the voxel"
+        sizes = torch.cat(sizes, dim=0)
+
+        # if self.voxelize_reduce:
+        #     feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(-1, 1)
+        #     feats = feats.contiguous()
 
         return feats, coords, sizes
 
@@ -281,7 +345,7 @@ def predict(
                 contains a tensor with shape (num_instances, 7).
         """
         batch_input_metas = [item.metainfo for item in batch_data_samples]
-        feats = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features)
+        feats, _ = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features)
 
         if self.with_bbox_head:
             outputs = self.bbox_head.predict(feats, batch_input_metas)
@@ -302,6 +366,7 @@ def extract_feat(
         features = []
 
         is_onnx_inference = False
+        pred_depths = None
         if imgs is not None and "lidar2img" not in batch_inputs_dict:
             # NOTE(knzo25): normal training and testing
             imgs = imgs.contiguous()
@@ -319,7 +384,13 @@ def extract_feat(
             camera2lidar = imgs.new_tensor(np.asarray(camera2lidar))
             img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix))
             lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix))
-            img_feature = self.extract_img_feat(
+            camera_depth_aware_parameters = self.prepare_camera_depth_aware_parameters(
+                camera_intrinsics=camera_intrinsics,
+                img_aug_matrix=img_aug_matrix,
+                lidar_aug_matrix=lidar_aug_matrix,
+                camera2lidar=camera2lidar,
+            )
+            img_feature, pred_depths = self.extract_img_feat(
                 imgs,
                 deepcopy(points),
                 lidar2image,
@@ -329,6 +400,7 @@ def extract_feat(
                 lidar_aug_matrix,
                 batch_input_metas,
                 using_image_features=using_image_features,
+                camera_depth_aware_parameters=camera_depth_aware_parameters,
             )
             features.append(img_feature)
         elif imgs is not None:
@@ -340,8 +412,10 @@ def extract_feat(
             img_aug_matrix = batch_inputs_dict["img_aug_matrix"]
             lidar_aug_matrix = batch_inputs_dict["lidar_aug_matrix"]
             geom_feats = batch_inputs_dict["geom_feats"]
+            # Retrieve the parameters from deployment code directly
+            camera_depth_aware_parameters = batch_inputs_dict["camera_depth_aware_parameters"]
 
-            img_feature = self.extract_img_feat(
+            img_feature, pred_depths = self.extract_img_feat(
                 imgs,
                 points,
                 lidar2image,
@@ -352,6 +426,7 @@ def extract_feat(
                 batch_input_metas,
                 geom_feats=geom_feats,
                 using_image_features=using_image_features,
+                camera_depth_aware_parameters=camera_depth_aware_parameters,
             )
             features.append(img_feature)
 
@@ -376,7 +451,7 @@ def extract_feat(
         if self.pts_neck is not None:
             x = self.pts_neck(x)
 
-        return x
+        return x, pred_depths
 
     def loss(
         self,
@@ -386,12 +461,145 @@ def loss(
         **kwargs,
     ) -> List[Det3DDataSample]:
         batch_input_metas = [item.metainfo for item in batch_data_samples]
-        feats = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features)
+        feats, pred_depths = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features)
 
         losses = dict()
+        if self.loss_depth_weight > 0 and pred_depths is not None:
+            with torch.amp.autocast("cuda", enabled=False):
+                gt_depths = torch.stack(
+                    [
+                        (
+                            meta["gt_depths"]
+                            if isinstance(meta["gt_depths"], torch.Tensor)
+                            else torch.as_tensor(meta["gt_depths"])
+                        )
+                        for meta in batch_input_metas
+                    ]
+                ).to(device=pred_depths.device, dtype=torch.float32)
+                depth_loss = self.get_depth_loss(gt_depths, pred_depths)
+                losses["loss_depth"] = depth_loss
+
         if self.with_bbox_head:
             bbox_loss = self.bbox_head.loss(feats, batch_data_samples)
-
-        losses.update(bbox_loss)
+            losses.update(bbox_loss)
 
         return losses
+
+    def _visualize_one_hot_gt_depth(
+        self,
+        gt_depths_one_hot: Tensor,
+        batch_size: int,
+        num_cameras: int,
+        height: int,
+        width: int,
+        batch_idx: int = 0,
+        num_channels: int = 6,
+    ) -> None:
+        """Save one-hot depth GT maps for the first batch and first few depth channels.
+
+        Args:
+            gt_depths_one_hot (Tensor): One-hot depth GT of shape [B*N*H*W, D].
+            batch_size (int): Batch size B from the original input.
+            num_cameras (int): Number of camera views N from the original input.
+            height (int): Original input height H before downsampling.
+            width (int): Original input width W before downsampling.
+            batch_idx (int): Batch index to visualize.
+            num_channels (int): Number of depth-bin channels to visualize.
+        """
+        if self.visualize_gt_depth_dir is None:
+            return
+
+        if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
+            return
+
+        if batch_size <= batch_idx or num_cameras == 0:
+            return
+
+        downsample = self.depth_gt_downsample
+        height_down = height // downsample
+        width_down = width // downsample
+        num_depth_bins = gt_depths_one_hot.shape[1]
+
+        num_channels = min(num_channels, num_depth_bins)
+        if num_channels == 0 or height_down == 0 or width_down == 0:
+            return
+
+        with torch.no_grad():
+            one_hot = gt_depths_one_hot.view(batch_size, num_cameras, height_down, width_down, num_depth_bins)
+            depth_channels = one_hot[batch_idx, 0, :, :, :num_channels].detach().float().cpu().numpy()
+
+        ncols = min(3, num_channels)
+        nrows = math.ceil(num_channels / ncols)
+        fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows), squeeze=False)
+
+        dbounds = self.view_transform.dbound
+        for ch_idx in range(num_channels):
+            ax = axes[ch_idx // ncols, ch_idx % ncols]
+            channel_map = depth_channels[:, :, ch_idx]
+            depth_m = dbounds[0] + (ch_idx + 0.5) * dbounds[2]
+            im = ax.imshow(channel_map, cmap="viridis", vmin=0, vmax=1, interpolation="nearest")
+            ax.set_title(f"batch {batch_idx}, depth bin {ch_idx} (~{depth_m:.1f}m)")
+            ax.set_xticks([])
+            ax.set_yticks([])
+            fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
+
+        for ch_idx in range(num_channels, nrows * ncols):
+            axes[ch_idx // ncols, ch_idx % ncols].axis("off")
+
+        fig.suptitle(f"one-hot gt_depth (batch={batch_idx}, cam=0, bins=0-{num_channels - 1})")
+        fig.tight_layout()
+
+        if not hasattr(self, "_gt_depth_one_hot_vis_count"):
+            self._gt_depth_one_hot_vis_count = 0
+        self._gt_depth_one_hot_vis_count += 1
+        save_path = self.visualize_gt_depth_dir / f"gt_depth_one_hot_{self._gt_depth_one_hot_vis_count:06d}.png"
+        fig.savefig(save_path, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        print_log(f"Saved one-hot gt_depth visualization to {save_path.resolve()}")
+
+    def get_downsampled_gt_depth(self, gt_depths):
+        """
+        Input:
+            gt_depths: [B, N, H, W]
+        Output:
+            gt_depths: [B*N*h*w, d]
+        """
+        B, N, H, W = gt_depths.shape
+        D = self.view_transform.D
+        dbounds = self.view_transform.dbound
+        gt_depths = gt_depths.view(
+            B * N,
+            H // self.depth_gt_downsample,
+            self.depth_gt_downsample,
+            W // self.depth_gt_downsample,
+            self.depth_gt_downsample,
+            1,
+        )
+        gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous()
+        gt_depths = gt_depths.view(-1, self.depth_gt_downsample * self.depth_gt_downsample)
+        gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths)
+        gt_depths = torch.min(gt_depths_tmp, dim=-1).values
+        gt_depths = gt_depths.view(B * N, H // self.depth_gt_downsample, W // self.depth_gt_downsample)
+
+        gt_depths = (gt_depths - (dbounds[0] - dbounds[2])) / dbounds[2]
+        # gt_depths = torch.where(gt_depths >= 0.0, gt_depths, torch.zeros_like(gt_depths))
+        # gt_depths = torch.clamp(gt_depths, max=float(D))
+        gt_depths = torch.where((gt_depths >= 0.0) & (gt_depths < D + 1), gt_depths, torch.zeros_like(gt_depths))
+        # gt_depths = torch.clamp(gt_depths, max=float(D))
+        gt_depths = F.one_hot(gt_depths.long(), num_classes=D + 1).view(-1, D + 1)[:, 1:]
+        self._visualize_one_hot_gt_depth(gt_depths, B, N, H, W)
+        return gt_depths.float()
+
+    def get_depth_loss(self, depth_labels, depth_preds):
+        depth_labels = self.get_downsampled_gt_depth(depth_labels)
+        # (B, N, D, H, W) -> (B*N*H*W, D)
+        depth_preds = depth_preds.permute(0, 1, 3, 4, 2).contiguous().view(-1, self.view_transform.D)
+        fg_mask = torch.max(depth_labels, dim=1).values > 0.0
+        depth_labels = depth_labels[fg_mask]
+        depth_preds = depth_preds[fg_mask]
+        depth_loss = F.binary_cross_entropy(
+            depth_preds,
+            depth_labels,
+            reduction="none",
+        ).sum() / max(1.0, fg_mask.sum())
+        return self.loss_depth_weight * depth_loss
diff --git a/projects/BEVFusion/bevfusion/bevfusion_head.py b/projects/BEVFusion/bevfusion/bevfusion_head.py
index c37c5a538..8dc4bce45 100644
--- a/projects/BEVFusion/bevfusion/bevfusion_head.py
+++ b/projects/BEVFusion/bevfusion/bevfusion_head.py
@@ -17,6 +17,8 @@
 from mmengine.structures import InstanceData
 from torch import nn
 
+from .ops.topk.topk import topk
+
 
 def clip_sigmoid(x, eps=1e-4):
     y = torch.clamp(x.sigmoid_(), min=eps, max=1 - eps)
@@ -26,11 +28,18 @@ def clip_sigmoid(x, eps=1e-4):
 @MODELS.register_module()
 class ConvFuser(nn.Sequential):
 
-    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, padding: int) -> None:
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, padding: int) -> None:
         self.in_channels = in_channels
         self.out_channels = out_channels
         super().__init__(
-            nn.Conv2d(sum(in_channels), out_channels, kernel_size, padding, bias=False),
+            nn.Conv2d(
+                sum(in_channels),
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                bias=False,
+            ),
             nn.BatchNorm2d(out_channels),
             nn.ReLU(True),
         )
@@ -63,6 +72,7 @@ def __init__(
         norm_cfg=dict(type="BN1d"),
         bias="auto",
         # loss
+        loss_iou=None,
         loss_cls=dict(type="mmdet.GaussianFocalLoss", reduction="mean"),
         loss_bbox=dict(type="mmdet.L1Loss", reduction="mean"),
         loss_heatmap=dict(type="mmdet.GaussianFocalLoss", reduction="mean"),
@@ -88,8 +98,10 @@ def __init__(
         if not self.use_sigmoid_cls:
             self.num_classes += 1
         self.loss_cls = MODELS.build(loss_cls)
+        self.loss_iou = MODELS.build(loss_iou) if loss_iou is not None else None
         self.loss_bbox = MODELS.build(loss_bbox)
         self.loss_heatmap = MODELS.build(loss_heatmap)
+        self.share_conv_out_channels = hidden_channel
 
         self.bbox_coder = build_bbox_coder(bbox_coder)
         self.sampling = False
@@ -155,7 +167,11 @@ def __init__(
         # Position Embedding for Cross-Attention, which is re-used during training # noqa: E501
         x_size = self.test_cfg["grid_size"][0] // self.test_cfg["out_size_factor"]
         y_size = self.test_cfg["grid_size"][1] // self.test_cfg["out_size_factor"]
-        self.bev_pos = self.create_2D_grid(x_size, y_size)
+        self.spatial_dim = x_size * y_size
+        bev_pos = self.create_2D_grid(x_size, y_size)
+
+        # Register the bev_pos as a buffer so it moves to the GPU automatically.
+        self.register_buffer("bev_pos", bev_pos, persistent=False)  # (1, H * W, 2)
 
         self.img_feat_pos = None
         self.img_feat_collapsed_pos = None
@@ -174,11 +190,22 @@ def __init__(
             self.dense_heatmap_exclude_pooling_classes = sorted(
                 list(set(self.class_name_to_indices.values()) - set(self.dense_heatmap_pooling_class_indices))
             )
+            # Pre-compute the correct order of the classes for the final local_max
+            heatmap_concat_order = (
+                self.dense_heatmap_pooling_class_indices + self.dense_heatmap_exclude_pooling_classes
+            )
+            local_concat_class_remapping = [heatmap_concat_order.index(i) for i in range(self.num_classes)]
         else:
             self.dense_heatmap_pooling_class_indices = None
             self.dense_heatmap_exclude_pooling_classes = None
+            local_concat_class_remapping = [i for i in range(self.num_classes)]
 
+        # Register the remapping as a buffer so it moves to the GPU automatically and gets saved in the state_dict.
+        self.register_buffer(
+            "local_concat_class_remapping", torch.tensor(local_concat_class_remapping), persistent=False
+        )
         self.local_heatmap_padding = self.nms_kernel_size // 2
+
         # NMS clusters
         self.nms_clusters = self.test_cfg.get("nms_clusters", [])
         # Add class indices for nms
@@ -199,7 +226,8 @@ def __init__(
             self.partial_ignore_labels = None
 
         print_log(f"BEVFusionHead Partial ignore labels: {self.partial_ignore_labels}, dense heatmap pooling classes: \
-        {self.dense_heatmap_pooling_classes}, class_names: {self.class_names}", logger="current")
+        {self.dense_heatmap_pooling_classes}, class_names: {self.class_names}, \
+        local_concat_class_remapping: {self.local_concat_class_remapping}", logger="current")
 
     def create_2D_grid(self, x_size, y_size):
         meshgrid = [[0, x_size - 1, x_size], [0, y_size - 1, y_size]]
@@ -247,24 +275,21 @@ def forward_single(self, inputs, metas):
         Returns:
             list[dict]: Output results for tasks.
         """
-        batch_size = inputs.shape[0]
+        # batch_size = inputs.shape[0]
         fusion_feat = self.shared_conv(inputs)
 
         #################################
         # image to BEV
         #################################
-        fusion_feat_flatten = fusion_feat.view(batch_size, fusion_feat.shape[1], -1)  # [BS, C, H*W]
-        bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(fusion_feat.device)
+        fusion_feat_flatten = fusion_feat.view(-1, self.share_conv_out_channels, self.spatial_dim)  # [BS, C, H*W]
 
         #################################
         # query initialization
         #################################
-        with torch.cuda.amp.autocast(enabled=False):
+        with torch.amp.autocast("cuda", enabled=False):
             # with torch.autocast('cuda', enabled=False):
             dense_heatmap = self.heatmap_head(fusion_feat.float())
         heatmap = dense_heatmap.detach().sigmoid()
-        local_max = torch.zeros_like(heatmap)
-        # equals to nms radius = voxel_size * out_size_factor * kenel_size
         if self.dense_heatmap_pooling_class_indices is not None:
             # Pooling
             selected_heatmap = heatmap[:, self.dense_heatmap_pooling_class_indices, :, :]
@@ -274,29 +299,44 @@ def forward_single(self, inputs, metas):
                 stride=1,
                 padding=0,
             )
-            local_max[
-                :,
-                self.dense_heatmap_pooling_class_indices,
-                self.local_heatmap_padding : (-self.local_heatmap_padding),
-                self.local_heatmap_padding : (-self.local_heatmap_padding),
-            ] = local_max_inner
-            # Non-pooling classes
+
+            # 2. Restore spatial size using F.pad instead of slice mutation
+            local_max = F.pad(
+                local_max_inner,
+                (
+                    self.local_heatmap_padding,
+                    self.local_heatmap_padding,
+                    self.local_heatmap_padding,
+                    self.local_heatmap_padding,
+                ),
+                mode="constant",
+                value=0.0,
+            )
+
+            # 3. Any non-pooling classes
             if self.dense_heatmap_exclude_pooling_classes:
-                local_max[:, self.dense_heatmap_exclude_pooling_classes] = heatmap[
-                    :, self.dense_heatmap_exclude_pooling_classes
-                ]
+                excluded_local_max = heatmap[:, self.dense_heatmap_exclude_pooling_classes, :, :]
+                local_max = torch.cat([local_max, excluded_local_max], dim=1)
+                local_max = local_max[:, self.local_concat_class_remapping, :, :]
         else:
             local_max = heatmap
 
         heatmap = heatmap * (heatmap == local_max)
-        heatmap = heatmap.view(batch_size, heatmap.shape[1], -1)
+        # (BS, num_classes, H*W)
+        heatmap = heatmap.view(-1, self.num_classes, self.spatial_dim)
 
         # top num_proposals among all classes
-        top_proposals = heatmap.view(batch_size, -1).argsort(dim=-1, descending=True)[..., : self.num_proposals]
-        top_proposals_class = top_proposals // heatmap.shape[-1]
-        top_proposals_index = top_proposals % heatmap.shape[-1]
+        flattened_heatmap = heatmap.view(-1, self.num_classes * self.spatial_dim)
+
+        # Use topk instead of argsort to avoid sorting the entire flattened heatmap.
+        top_proposals = topk(x=flattened_heatmap, k=self.num_proposals, dim=-1, sorted=False)
+
+        # 2. Calculate class and spatial indices
+        # Use shape[-1] dynamically to handle grid sizes safely.
+        top_proposals_class = top_proposals // self.spatial_dim
+        top_proposals_index = top_proposals % self.spatial_dim
         query_feat = fusion_feat_flatten.gather(
-            index=top_proposals_index[:, None, :].expand(-1, fusion_feat_flatten.shape[1], -1),
+            index=top_proposals_index[:, None, :].expand(-1, self.share_conv_out_channels, -1),
             dim=-1,
         )
         self.query_labels = top_proposals_class
@@ -306,10 +346,8 @@ def forward_single(self, inputs, metas):
         query_cat_encoding = self.class_encoding(one_hot.float())
         query_feat += query_cat_encoding
 
-        query_pos = bev_pos.gather(
-            index=top_proposals_index[:, None, :].permute(0, 2, 1).expand(-1, -1, bev_pos.shape[-1]),
-            dim=1,
-        )
+        # (B, N, 2)
+        query_pos = self.bev_pos.squeeze(0)[top_proposals_index]
         #################################
         # transformer decoder layer (Fusion feature as K,V)
         #################################
@@ -317,7 +355,9 @@ def forward_single(self, inputs, metas):
         for i in range(self.num_decoder_layers):
             # Transformer Decoder Layer
             # :param query: B C Pq    :param query_pos: B Pq 3/6
-            query_feat = self.decoder[i](query_feat, key=fusion_feat_flatten, query_pos=query_pos, key_pos=bev_pos)
+            query_feat = self.decoder[i](
+                query_feat, key=fusion_feat_flatten, query_pos=query_pos, key_pos=self.bev_pos
+            )
 
             # Prediction
             res_layer = self.prediction_heads[i](query_feat)
@@ -384,8 +424,10 @@ def predict_by_feat(self, preds_dicts, metas, img=None, rescale=False, for_roi=F
         for layer_id, preds_dict in enumerate(preds_dicts):
             batch_size = preds_dict[0]["heatmap"].shape[0]
             batch_score = preds_dict[0]["heatmap"][..., -self.num_proposals :].sigmoid()
-            # if self.loss_iou.loss_weight != 0:
-            #    batch_score = torch.sqrt(batch_score * preds_dict[0]['iou'][..., -self.num_proposals:].sigmoid()) # noqa: E501
+            if self.loss_iou is not None:
+                batch_score = torch.sqrt(
+                    batch_score * preds_dict[0]["iou"][..., -self.num_proposals :].sigmoid()
+                )  # noqa: E501
             one_hot = F.one_hot(self.query_labels, num_classes=self.num_classes).permute(0, 2, 1)
             batch_score = batch_score * preds_dict[0]["query_heatmap_score"] * one_hot
 
@@ -433,6 +475,7 @@ def predict_by_feat(self, preds_dicts, metas, img=None, rescale=False, for_roi=F
                                     circle_nms(
                                         boxes_for_nms.detach().cpu().numpy(),
                                         nms_cluster["nms_threshold"],
+                                        post_max_size=nms_cluster["post_max_size"],
                                     )
                                 )
                             else:
@@ -780,8 +823,9 @@ def loss_by_feat(
             for cls_i, class_name in enumerate(self.class_names):
                 loss_dict[f"loss_heatmap_{class_name}"] = loss_heatmap_cls[cls_i]
 
-            # Prevent loss item to avoid computing gradients twice. This is for logging.
-            loss_dict["total_dense_heatmap"] = loss_heatmap_cls.sum()
+            # Logging-only aggregate. Detach so it does not retain the autograd graph;
+            # the per-class `loss_heatmap_{class_name}` entries are what drive gradients.
+            loss_dict["total_dense_heatmap"] = loss_heatmap_cls.sum().detach()
 
         # compute loss for each layer
         for idx_layer in range(self.num_decoder_layers if self.auxiliary else 1):
@@ -855,7 +899,22 @@ def loss_by_feat(
 
             loss_dict[f"{prefix}_loss_cls"] = layer_loss_cls
             loss_dict[f"{prefix}_loss_bbox"] = layer_loss_bbox
-            # loss_dict[f'{prefix}_loss_iou'] = layer_loss_iou
+
+            # Output iou for iou-aware loss
+            if self.loss_iou is not None:
+                layer_ious = preds_dict["iou"][
+                    ...,
+                    idx_layer * self.num_proposals : (idx_layer + 1) * self.num_proposals,
+                ].squeeze(
+                    1
+                )  # [BS, num_proposals]
+
+                # [BS, num_proposals]
+                layer_iou_weights = layer_bbox_weights[:, :, 0]
+                #   print(layer_ious.shape, ious.shape, layer_iou_weights.shape, "layer_ious.shape, ious.shape, layer_iou_weights.shape")
+                loss_dict[f"{prefix}_loss_iou"] = self.loss_iou(
+                    layer_ious, ious, layer_iou_weights, avg_factor=max(num_pos, 1)
+                )
 
         loss_dict["matched_ious"] = layer_loss_cls.new_tensor(matched_ious)
 
diff --git a/projects/BEVFusion/bevfusion/bevfusion_voxel_encoder.py b/projects/BEVFusion/bevfusion/bevfusion_voxel_encoder.py
new file mode 100644
index 000000000..d7801482a
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/bevfusion_voxel_encoder.py
@@ -0,0 +1,400 @@
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+from mmdet3d.models.voxel_encoders.utils import PFNLayer, get_paddings_indicator
+from mmdet3d.registry import MODELS
+from torch import Tensor, nn
+
+
+@MODELS.register_module()
+class HardSimpleVoxelSinCosEncoder(nn.Module):
+    def __init__(
+        self, min_norm_values: Tuple[float], max_norm_values: Tuple[float], in_channels: Optional[int] = 4
+    ) -> None:
+        """
+        Simple voxel encoder that only performs mean pooling on the normalize features, and then
+        performs sin-cos (fourier encoding) on each voxel channels.
+
+        The output shape of each voxel is (N, feature_channels*2).
+        Args:
+            min_norm_values (Tuple[float]): Minimum values for the features.
+            max_norm_values (Tuple[float]): Maximum values for the features.
+            in_channels (int): Number of input channels.
+        """
+        super().__init__()
+
+        # Create PillarFeatureNet layers
+        self.in_channels = in_channels
+
+        # Convert the ((x - min) / (max - min)) * pi * exponents to x * scale + bias for folding them into one OP
+        min_norm_values = torch.tensor(min_norm_values)
+        max_norm_values = torch.tensor(max_norm_values)
+        # Let alpha = pi * exponents, beta = max - min
+        # y = ((x - min) / beta) * alpha
+        # y = alpha / beta * (x - min)
+        # y = (alpha / beta) * x - (alpha / beta) * min
+        # Therefore, scale = alpha / beta, bias = - (alpha * min) / beta
+        # y = scale * x + bias
+        exponents = (2 ** torch.arange(0, self.in_channels)).float()
+        alpha = (torch.pi * exponents).unsqueeze(0)  # (1, C)
+        beta = (max_norm_values - min_norm_values).unsqueeze(1)  # (C, 1)
+        scale = alpha / beta
+        bias = -(alpha * min_norm_values.unsqueeze(1)) / beta  # (C, C)
+
+        self.register_buffer("exponent_scale", scale.unsqueeze(0), persistent=False)  # (1, C, C)
+        self.register_buffer("exponent_bias", bias.unsqueeze(0), persistent=False)  # (1, C, C)
+
+    def forward(self, features: Tensor, num_points: Tensor, coors: Tensor, *args, **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            features (torch.Tensor): Point features or raw points in shape
+                (N, M, C) in (x, y, z, intensity, time_lag) if C is 5, (x, y, z, time_lag) if C is 4.
+            num_points (torch.Tensor): Number of points in each pillar in shape (M).
+            coors (torch.Tensor): Coordinates of each voxel in (M, [4]), which is (batch_idx, z_idx, y_idx, x_idx).
+
+        Returns:
+            torch.Tensor: Features of pillars in shape (M, C*C*2).
+
+        """
+        # Mean in the voxel
+        # (N, M, C) -> (N, C)
+        voxel_mean_features = (
+            features.sum(dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1)
+        ).contiguous()
+
+        # x * scale + bias, (1, C, C) + (1, C, C) * (N, C, 1) -> (N, C, C)
+        # FMA (fused multiply-add): y = bias + scale * voxel_mean_features
+        y = torch.addcmul(self.exponent_bias, self.exponent_scale, voxel_mean_features.unsqueeze(-1))
+        # SinCos encoding
+        # (N*C, C) -> (N, C*C)
+        y = y.reshape(-1, self.in_channels * self.in_channels)
+        # (N, C*C) -> (N, C*C*2)
+        voxel_fourier_features = torch.cat([torch.cos(y), torch.sin(y)], dim=1)
+
+        return voxel_fourier_features
+
+
+@MODELS.register_module()
+class BEVFusionVoxelFeatureNet(HardSimpleVoxelSinCosEncoder):
+    """BEVFusion Voxel Encoder Feature Net.
+
+    The network is same as HardSimpleVoxelSinCosEncoder, but it performs PFNLayers on the
+    offset features, for example, distances. After that, it concatenates the fourier features and the PFN features
+    along the channel dimension for each voxel.
+
+    Args:
+        min_norm_values (Tuple[float]): Minimum values for the features.
+        max_norm_values (Tuple[float]): Maximum values for the features.
+        in_channels (int): Number of input channels.
+        feat_channels (tuple, optional): Number of features in each of the
+            N PFNLayers. Defaults to (64, ).
+    """
+
+    def __init__(
+        self,
+        min_norm_values: Optional[Tuple[float]] = None,
+        max_norm_values: Optional[Tuple[float]] = None,
+        in_channels: Optional[int] = 4,
+        feat_channels: Optional[tuple] = (64,),
+        with_distance: Optional[bool] = False,
+        with_cluster_center: Optional[bool] = True,
+        with_voxel_center: Optional[bool] = True,
+        voxel_size: Optional[Tuple[float]] = (0.2, 0.2, 4),
+        point_cloud_range: Optional[Tuple[float]] = (0, -40, -3, 70.4, 40, 1),
+        norm_cfg: Optional[dict] = dict(type="BN1d", eps=1e-3, momentum=0.01),
+        mode: Optional[str] = "max",
+        legacy: Optional[bool] = True,
+    ):
+
+        super(BEVFusionVoxelFeatureNet, self).__init__(
+            min_norm_values=min_norm_values, max_norm_values=max_norm_values, in_channels=in_channels
+        )
+        assert len(feat_channels) > 0
+        self.legacy = legacy
+        pfn_in_channels = in_channels
+        if with_cluster_center:
+            pfn_in_channels += 3
+        if with_voxel_center:
+            pfn_in_channels += 3
+        if with_distance:
+            pfn_in_channels += 1
+
+        assert pfn_in_channels > 0, "pfn_in_channels must be greater than 0"
+        self._with_distance = with_distance
+        self._with_cluster_center = with_cluster_center
+        self._with_voxel_center = with_voxel_center
+
+        # Create VoxelFeatureNet layers
+        feat_channels = [pfn_in_channels] + list(feat_channels)
+        pfn_layers = []
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
+            if i < len(feat_channels) - 2:
+                last_layer = False
+            else:
+                last_layer = True
+            pfn_layers.append(PFNLayer(in_filters, out_filters, norm_cfg=norm_cfg, last_layer=last_layer, mode=mode))
+        self.pfn_layers = nn.ModuleList(pfn_layers)
+
+        # Need pillar (voxel) size and x/y offset in order to calculate offset
+        self.vx = voxel_size[0]
+        self.vy = voxel_size[1]
+        self.vz = voxel_size[2]
+        self.x_offset = self.vx / 2 + point_cloud_range[0]
+        self.y_offset = self.vy / 2 + point_cloud_range[1]
+        self.z_offset = self.vz / 2 + point_cloud_range[2]
+        self.point_cloud_range = point_cloud_range
+
+    def forward(self, features: Tensor, num_points: Tensor, coors: Tensor, *args, **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            features (torch.Tensor): Point features or raw points in shape
+                (N, M, C) in (x, y, z, intensity, time_lag) if C is 5, (x, y, z, time_lag) if C is 4.
+            num_points (torch.Tensor): Number of points in each pillar in shape (M).
+            coors (torch.Tensor): Coordinates of each voxel in (M, [4]), which is (batch_idx, z_idx, y_idx, x_idx).
+
+        Returns:
+            torch.Tensor: Features of pillars in shape (M, C*C*2 + feat_channels[-1]).
+        """
+        # (M, C*C*2)
+        voxel_fourier_features = super().forward(features, num_points, coors)
+
+        # Normalize the features
+        norm_features = (features - self.min_norm_values.view(1, -1)) / (
+            (self.max_norm_values - self.min_norm_values).view(1, -1)
+        )
+
+        # Offset features
+        max_points_per_voxel = features.shape[1]
+
+        features_ls = [norm_features]
+        # Find distance of x, y, and z from cluster center, mapped to [-1,   1] if available
+        if self._with_cluster_center:
+            points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_points.type_as(features).view(-1, 1, 1)
+            f_cluster = features[:, :, :3] - points_mean
+            # Map to [0, 1] if available
+            # if self.min_norm_values is not None and self.max_norm_values is not None:
+            #     voxel_size = features.new_tensor([self.vx, self.vy, self.vz])
+            #     f_cluster = f_cluster / voxel_size
+            features_ls.append(f_cluster)
+
+        # Find distance of x, y, and z from pillar center
+        dtype = features.dtype
+        if self._with_voxel_center:
+            if not self.legacy:
+                f_center = torch.zeros_like(features[:, :, :3])
+                f_center[:, :, 0] = features[:, :, 0] - (coors[:, 3].to(dtype).unsqueeze(1) * self.vx + self.x_offset)
+                f_center[:, :, 1] = features[:, :, 1] - (coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset)
+                f_center[:, :, 2] = features[:, :, 2] - (coors[:, 1].to(dtype).unsqueeze(1) * self.vz + self.z_offset)
+            else:
+                f_center = features[:, :, :3]
+                f_center[:, :, 0] = f_center[:, :, 0] - (
+                    coors[:, 3].type_as(features).unsqueeze(1) * self.vx + self.x_offset
+                )
+                f_center[:, :, 1] = f_center[:, :, 1] - (
+                    coors[:, 2].type_as(features).unsqueeze(1) * self.vy + self.y_offset
+                )
+                f_center[:, :, 2] = f_center[:, :, 2] - (
+                    coors[:, 1].type_as(features).unsqueeze(1) * self.vz + self.z_offset
+                )
+
+            # if self.min_norm_values is not None and self.max_norm_values is not None:
+            #     f_center = f_center / (voxel_size * 0.5)
+            features_ls.append(f_center)
+
+        if self._with_distance:
+            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
+            features_ls.append(points_dist)
+
+        # Combine together feature decorations
+        voxel_feature_offsets = torch.cat(features_ls, dim=-1)
+
+        # The feature decorations were calculated without regard to whether
+        # pillar was empty. Need to ensure that
+        # empty pillars remain set to zeros.
+        mask = get_paddings_indicator(num_points, max_points_per_voxel, axis=0)
+        mask = torch.unsqueeze(mask, -1).type_as(voxel_feature_offsets)
+        voxel_feature_offsets *= mask
+
+        # PFN
+        for pfn in self.pfn_layers:
+            voxel_feature_offsets = pfn(voxel_feature_offsets, num_points)
+
+        # Concat
+        features = torch.cat([voxel_fourier_features, voxel_feature_offsets.squeeze(1)], dim=-1)
+
+        return features
+
+
+# @MODELS.register_module()
+# class BEVFusionVoxelSinCosEncoder(nn.Module):
+#     def __init__(self,
+#                  min_norm_values: Tuple[float],
+#                  max_norm_values: Tuple[float],
+#                  time_lag_channel_index: int = 3,
+#                  time_exp_factor: Optional[float] = None,
+#                  feat_channels: Optional[tuple] = (16, ),
+#                  in_channels: Optional[int] = 4,
+#                  with_distance: Optional[bool] = False,
+#                  with_cluster_center: Optional[bool] = True,
+#                  with_voxel_center: Optional[bool] = True,
+#                  voxel_size: Optional[Tuple[float]] = (0.2, 0.2, 4),
+#                  point_cloud_range: Optional[Tuple[float]] = (0, -40, -3, 70.4,
+#                                                               40, 1),
+#                  norm_cfg: Optional[dict] = dict(
+#                      type='BN1d', eps=1e-3, momentum=0.01),
+#                  mode: Optional[str] = 'max'):
+#         super(BEVFusionVoxelSinCosEncoder, self).__init__()
+
+#         self._with_distance = with_distance
+#         self._with_cluster_center = with_cluster_center
+#         self._with_voxel_center = with_voxel_center
+#         # Create PillarFeatureNet layers
+#         self.in_channels = in_channels
+
+#         # Need pillar (voxel) size and x/y offset in order to calculate offset
+#         self.vx = voxel_size[0]
+#         self.vy = voxel_size[1]
+#         self.vz = voxel_size[2]
+#         self.x_offset = self.vx / 2 + point_cloud_range[0]
+#         self.y_offset = self.vy / 2 + point_cloud_range[1]
+#         self.z_offset = self.vz / 2 + point_cloud_range[2]
+#         self.point_cloud_range = point_cloud_range
+
+#         self.xyz_channels = 3
+#         feat_offset_channels = in_channels - self.xyz_channels
+#         if with_cluster_center:
+#             feat_offset_channels += 3
+#         if with_voxel_center:
+#             feat_offset_channels += 3
+#         if with_distance:
+#             feat_offset_channels += 1
+
+#         feat_channels = [feat_offset_channels] + list(feat_channels)
+#         assert len(feat_channels) > 0, "feat_channels must be greater than 0"
+#         pfn_layers = []
+#         for i in range(len(feat_channels) - 1):
+#             in_filters = feat_channels[i]
+#             out_filters = feat_channels[i + 1]
+#             if i < len(feat_channels) - 2:
+#                 last_layer = False
+#             else:
+#                 last_layer = True
+#             pfn_layers.append(
+#                 PFNLayer(
+#                     in_filters,
+#                     out_filters,
+#                     norm_cfg=norm_cfg,
+#                     last_layer=last_layer,
+#                     mode=mode))
+#         self.pfn_layers = nn.ModuleList(pfn_layers)
+
+#         self.time_lag_channel_index = time_lag_channel_index
+#         self.time_exp_factor = time_exp_factor
+
+#         self.register_buffer("min_norm_values", torch.tensor(min_norm_values))
+#         self.register_buffer("max_norm_values", torch.tensor(max_norm_values))
+#         self.register_buffer("voxel_size", torch.tensor([self.vx, self.vy, self.vz]))
+#         self.register_buffer("exponents", (2 ** torch.arange(0, self.xyz_channels)).float())
+
+#     def forward(self, features: Tensor, num_points: Tensor, coors: Tensor,
+#                 *args, **kwargs) -> Tensor:
+#         """Forward function.
+
+#         Args:
+#             features (torch.Tensor): Point features or raw points in shape
+#                 (N, M, C).
+#             num_points (torch.Tensor): Number of points in each pillar in shape (M).
+#             coors (torch.Tensor): Coordinates of each voxel in (M, [4]), which is (batch_idx, z_idx, y_idx, x_idx).
+
+#         Returns:
+#             torch.Tensor: Features of pillars in shape (M, C).
+#         """
+#         num_voxels, max_points_per_voxel = features.shape[0], features.shape[1]
+
+#         # Mean in the voxel
+#         # (N, M, 3) -> (N, 3)
+#         voxel_features = (features[:, :, :self.xyz_channels].sum(dim=1, keepdim=False) / num_points.type_as(features).view(
+#                     -1, 1)).contiguous()
+
+#         # min-max normalization, (N, 3) -> (N, 3)
+#         voxel_features_norm = (voxel_features - \
+#          self.min_norm_values[:self.xyz_channels].view(1, -1)) / ((self.max_norm_values[:self.xyz_channels] - self.min_norm_values[:self.xyz_channels]).view(1, -1))
+
+#         # SinCos encoding
+#         # (N, 3) -> (N*3, 1) * (1, ) * (1, 3) -> (N*3, 3)
+#         y = voxel_features_norm.reshape(-1, 1) * np.pi * self.exponents.reshape(1, -1)
+#         # (N*3, 3) -> (N, 3*3)
+#         y = y.reshape(num_voxels, -1)
+#         # (N, 3*3) -> (N, 3*3*2)
+#         voxel_fourier_features = torch.cat([torch.cos(y), torch.sin(y)], dim=1)
+
+#         # PFN
+#         # Other features, for example, intensity or time_lag
+#         other_features = features[:, :, self.xyz_channels:]
+
+#         # Normalization
+#         other_features_norm = (other_features - self.min_norm_values[self.xyz_channels:]) / (self.max_norm_values[self.xyz_channels:] - self.min_norm_values[self.xyz_channels:])
+
+#         time_lag_feature_index = self.time_lag_channel_index - self.xyz_channels
+#         # exponentiate time_lag features, it's higher when the normlized time lag is lower
+#         # (1.0 when time_lag_features is 0.0)
+#         if self.time_exp_factor is not None:
+#             other_features_norm[:, :, time_lag_feature_index] = torch.exp(- other_features_norm[:, :, time_lag_feature_index] * self.time_exp_factor)
+#         else:
+#             # Inverse the time_lag feature
+#             other_features_norm[:, :, time_lag_feature_index] = 1.0 - other_features_norm[:, :, time_lag_feature_index]
+
+#         # Offsets
+#         voxel_feature_offsets = [other_features_norm]
+#         # Find distance of x, y, and z from cluster center
+#         if self._with_cluster_center:
+#             points_mean = features[:, :, :3].sum(
+#                 dim=1, keepdim=True) / num_points.type_as(features).view(
+#                     -1, 1, 1)
+
+#             # f_cluster = (features[:, :, :3] - points_mean)
+#             f_cluster = features[:, :, :3] - points_mean
+#             voxel_feature_offsets.append(f_cluster)
+
+#         # Find distance of x, y, and z from pillar center
+#         dtype = features.dtype
+#         if self._with_voxel_center:
+#             f_center = torch.zeros_like(features[:, :, :3])
+#             f_center[:, :, 0] = features[:, :, 0] - (
+#                 coors[:, 3].to(dtype).unsqueeze(1) * self.vx +
+#                 self.x_offset)
+#             f_center[:, :, 1] = features[:, :, 1] - (
+#                 coors[:, 2].to(dtype).unsqueeze(1) * self.vy +
+#                 self.y_offset)
+#             f_center[:, :, 2] = features[:, :, 2] - (
+#                 coors[:, 1].to(dtype).unsqueeze(1) * self.vz +
+#                 self.z_offset)
+
+#             # Map to [-1, 1]
+#             # f_center = f_center / (self.voxel_size * 0.5)
+#             voxel_feature_offsets.append(f_center)
+
+#         if self._with_distance:
+#             points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
+#             voxel_feature_offsets.append(points_dist)
+
+#         voxel_feature_offsets = torch.cat(voxel_feature_offsets, dim=-1)
+#         # The feature decorations were calculated without regard to whether
+#         # pillar was empty. Need to ensure that
+#         # empty pillars remain set to zeros.
+#         mask = get_paddings_indicator(num_points, max_points_per_voxel, axis=0)
+#         mask = torch.unsqueeze(mask, -1).type_as(voxel_feature_offsets)
+#         voxel_feature_offsets *= mask
+
+#         # PFN
+#         for pfn in self.pfn_layers:
+#             voxel_feature_offsets = pfn(voxel_feature_offsets, num_points)
+
+#         # Concat
+#         features = torch.cat([voxel_fourier_features, voxel_feature_offsets.squeeze(1)], dim=-1)
+#         return features
diff --git a/projects/BEVFusion/bevfusion/custom_sparse_conv_tensor.py b/projects/BEVFusion/bevfusion/custom_sparse_conv_tensor.py
new file mode 100644
index 000000000..888d2b1c0
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/custom_sparse_conv_tensor.py
@@ -0,0 +1,37 @@
+"""
+Custom SparseConvTensor for BEVFusion.
+This customiztion is used to support cleaner ONNX export of sparse convolutions.
+"""
+
+import torch
+from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import SparseConvTensor
+else:
+    from mmcv.ops import SparseConvTensor
+
+
+def sparse_to_dense(sparse_tensor: SparseConvTensor, batch_size: int, spatial_shapes: list[int], out_channels: int):
+    """
+    Convert the sparse tensor to a dense tensor.
+    """
+    H, W, D = spatial_shapes
+    num_cells = batch_size * H * W * D
+    idx = sparse_tensor.indices.to(sparse_tensor.features.device).long()  # [N, 1+D]
+    b, h, w, d = idx.unbind(1)
+    # b * (H * W * D) + h*(W*D) + w*D + d
+    # Factor out the common terms D and W
+    # (b*H*W + h*W + w) * D + d -> (b*H + h) * W + w) * D + d
+    linear_idx = ((b * H + h) * W + w) * D + d  # [N]
+
+    out = torch.zeros(
+        [num_cells, sparse_tensor.features.shape[1]],
+        device=sparse_tensor.features.device,
+        dtype=sparse_tensor.features.dtype,
+    )
+    # out = out.index_copy(0, linear_idx, sparse_tensor.features)
+    # out = out.scatter(0, linear_idx, sparse_tensor.features)
+    scatter_idx = linear_idx.unsqueeze(1).expand(-1, out_channels)  # [N, C]
+    out = out.scatter(0, scatter_idx, sparse_tensor.features)
+    return out.view(batch_size, H, W, D, out_channels)
diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py
index ac7c5b503..d0a547258 100644
--- a/projects/BEVFusion/bevfusion/depth_lss.py
+++ b/projects/BEVFusion/bevfusion/depth_lss.py
@@ -1,8 +1,13 @@
 # modify from https://github.com/mit-han-lab/bevfusion
+import math
+from pathlib import Path
 from typing import Tuple
 
+import matplotlib.pyplot as plt
+import numpy as np
 import torch
 from mmdet3d.registry import MODELS
+from mmengine.logging import print_log
 from torch import nn
 
 from .ops import bev_pool
@@ -164,6 +169,7 @@ def __init__(
         ybound: Tuple[float, float, float],
         zbound: Tuple[float, float, float],
         dbound: Tuple[float, float, float],
+        visualize_bev_feat: bool = False,
     ) -> None:
         super().__init__()
         self.in_channels = in_channels
@@ -183,6 +189,7 @@ def __init__(
         self.frustum = self.create_frustum()
         self.D = self.frustum.shape[0]
         self.fp16_enabled = False
+        self.visualize_bev_feat = visualize_bev_feat
 
     def create_frustum(self):
         iH, iW = self.image_size
@@ -319,8 +326,55 @@ def bev_pool_precomputed(self, x, geom_feats, kept, ranks, indices):
 
         # collapse Z
         final = torch.cat(x.unbind(dim=2), 1)
+        if self.visualize_bev_feat:
+            self.plot_bev_feat(final)
+
         return final
 
+    def plot_bev_feat(self, bev_feat):
+        """Visualize the BEV feat for the given batch index."""
+        try:
+            import torch.distributed as dist
+
+            if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
+                return
+        except ImportError:
+            pass
+
+        batch_idx = 0
+        if bev_feat.shape[0] <= batch_idx:
+            return
+
+        # save first 10 raw channel maps for one batch sample (B, C, Y, X)
+        num_channels = 10
+        with torch.no_grad():
+            feat = bev_feat[batch_idx].detach().float().cpu().numpy()
+        channel_indices = np.arange(min(num_channels, feat.shape[0]))
+        ncols = min(5, len(channel_indices))
+        nrows = math.ceil(len(channel_indices) / ncols)
+        fig, axes = plt.subplots(nrows, ncols, figsize=(3 * ncols, 3 * nrows), squeeze=False)
+        for ax, ch_idx in zip(axes.ravel(), channel_indices):
+            ch_map = feat[ch_idx]
+            im = ax.imshow(ch_map, cmap="viridis", origin="lower", aspect="equal")
+            ax.set_title(f"ch {ch_idx}", fontsize=9)
+            ax.set_xlabel("X")
+            ax.set_ylabel("Y")
+            fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
+        for ax in axes.ravel()[len(channel_indices) :]:
+            ax.axis("off")
+        fig.suptitle(f"bev_feat channels 0-{len(channel_indices) - 1} (batch={batch_idx})")
+        fig.tight_layout()
+
+        save_dir = Path("work_dirs/bev_feat_vis_2")
+        save_dir.mkdir(parents=True, exist_ok=True)
+        if not hasattr(self, "_bev_feat_vis_count"):
+            self._bev_feat_vis_count = 0
+        self._bev_feat_vis_count += 1
+        save_path = save_dir / f"bev_feat_batch{batch_idx}_{self._bev_feat_vis_count:06d}.png"
+        fig.savefig(save_path, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        print_log(f"Saved BEV feat visualization to {save_path.resolve()}")
+
     def forward(
         self,
         img,
diff --git a/projects/BEVFusion/bevfusion/depth_lss_v2.py b/projects/BEVFusion/bevfusion/depth_lss_v2.py
new file mode 100644
index 000000000..a95a88e4e
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/depth_lss_v2.py
@@ -0,0 +1,534 @@
+import math
+from pathlib import Path
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+from mmdet3d.registry import MODELS
+from mmengine.logging import print_log
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+
+from .depth_lss import BaseViewTransform, DepthLSSNet, DownSampleNet, LidarDepthImageNet
+from .ops import bev_pool_v2
+
+
+class SELayer(nn.Module):
+    """
+    Squeeze-and-Excitation (SE) layer.
+    This is used to modulate features with camera-depth aware parameters.
+    The code is taken from BEVDET (https://github.com/hustvl/BEVDET).
+    """
+
+    def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid):
+        super().__init__()
+        # Dont need global pooling because inputs are (B*N, C, 1, 1).
+        self.sequeeze_net = nn.Sequential(
+            # Squeeze with 1x1 convolution
+            nn.Conv2d(channels, channels, 1, bias=True),
+            # Activation
+            act_layer(),
+            # Expand with 1x1 convolution
+            nn.Conv2d(channels, channels, 1, bias=True),
+            # Gate with sigmoid activation
+            gate_layer(),
+        )
+
+    def forward(self, x: torch.Tensor, depth_aware_features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: Tuple[torch.Tensor, torch.Tensor], the input tuple containing the image features and camera-depth aware parameters.
+        Returns:
+            torch.Tensor, the output tensor of shape (B, N, C).
+        """
+        feature_attentions = self.sequeeze_net(depth_aware_features)
+        return x * feature_attentions
+
+
+class CameraDepthLinearProjectionMLP(nn.Module):
+    """
+    Linear projection module by MLP. This is used to project image (context) features and camera-depth
+    aware parameters (for example, intrinsics) to embedding space.
+    The code is taken from BEVDET (https://github.com/hustvl/BEVDET).
+    """
+
+    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int, drop_out: float = 0.0):
+        """
+        Args:
+            in_channels: int, the number of input channels.
+            hidden_channels: int, the number of hidden channels.
+            out_channels: int, the number of output channels.
+            drop_out: float, the dropout rate.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.drop_out = drop_out
+
+        self.sequential_mlp = nn.Sequential(
+            nn.Linear(in_channels, hidden_channels),
+            nn.ReLU(inplace=True),
+            nn.Dropout(drop_out),
+            nn.Linear(hidden_channels, out_channels),
+            nn.Dropout(drop_out),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: torch.Tensor, the input tensor of shape (B, N, C).
+        Returns:
+            torch.Tensor, the output tensor of shape (B, N, C).
+        """
+        return self.sequential_mlp(x)
+
+
+class CameraDepthAwareNet(nn.Module):
+    """
+    Camera-depth aware depth net. This is used to predict the depth of the scene.
+    The code is taken from BEVDET (https://github.com/hustvl/BEVDET).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        hidden_channels: int,
+        out_channels: int,
+        mlp_drop_out: float,
+        depth_channels: int,
+        with_cp: bool = False,
+        num_camera_depth_parameters: int = 27,
+    ) -> None:
+        """
+        Args:
+            in_channels: int, the number of input channels.
+            out_channels: int, the number of output channels.
+            mlp_drop_out: float, the dropout rate of the MLP.
+            mlp_hidden_channels: int, the number of hidden channels of the MLP.
+            mlp_out_channels: int, the number of output channels of the MLP.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.mlp_drop_out = mlp_drop_out
+        self.num_camera_depth_parameters = num_camera_depth_parameters
+        self.depth_channels = depth_channels
+        self.with_cp = with_cp
+
+        # Input convolution for context/image features
+        # Camera depth aware parameters branch
+        self.camera_depth_aware_parameters_bn = nn.BatchNorm1d(self.num_camera_depth_parameters)
+
+        # Context/image feature branch
+        self.context_input_conv = nn.Sequential(
+            nn.Conv2d(in_channels, hidden_channels, kernel_size=1, stride=1, bias=False),
+            nn.BatchNorm2d(hidden_channels),
+            nn.ReLU(inplace=True),
+        )
+        self.context_camera_depth_aware_mlp = CameraDepthLinearProjectionMLP(
+            in_channels=self.num_camera_depth_parameters,
+            hidden_channels=hidden_channels,
+            out_channels=hidden_channels,
+            drop_out=self.mlp_drop_out,
+        )
+        self.context_se = SELayer(channels=hidden_channels)
+        self.context_conv = nn.Conv2d(hidden_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True)
+
+        # Depth branch
+        self.depth_camera_depth_aware_mlp = CameraDepthLinearProjectionMLP(
+            in_channels=self.num_camera_depth_parameters,
+            hidden_channels=hidden_channels,
+            out_channels=hidden_channels,
+            drop_out=self.mlp_drop_out,
+        )
+        self.depth_se = SELayer(channels=hidden_channels)
+        self.depth_conv = nn.Sequential(
+            nn.Conv2d(hidden_channels, depth_channels, kernel_size=1, stride=1, padding=0, bias=True)
+        )
+
+    def context_forward(
+        self, context_features: torch.Tensor, camera_depth_aware_features: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: torch.Tensor, the input tensor of shape (B*N, C, H, W).
+            camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B*N, N_CAMERA_DEPTH_PARAMETERS).
+        Returns:
+            torch.Tensor, the output tensor of shape (B*N, C, H, W).
+        """
+        context_camera_depth_aware_features = self.context_camera_depth_aware_mlp(camera_depth_aware_features)
+        # # (B*N, mlp_out_channels) -> (B*N, mlp_out_channels, 1, 1)
+        context_camera_depth_aware_features = context_camera_depth_aware_features.view(-1, self.hidden_channels, 1, 1)
+        context_features = self.context_se(context_features, context_camera_depth_aware_features)
+        context_features = self.context_conv(context_features)
+        return context_features
+
+    def depth_forward(self, depth_features: torch.Tensor, camera_depth_aware_features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            depth_features: torch.Tensor, the input tensor of shape (B*N, C, H, W).
+            camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B, N, D).
+        Returns:
+            torch.Tensor, the output tensor of shape (B*N, C, H, W).
+        """
+        depth_camera_depth_aware_features = self.depth_camera_depth_aware_mlp(camera_depth_aware_features)
+        # # (B*N, mlp_out_channels) -> (B*N, mlp_out_channels, 1, 1)
+        depth_camera_depth_aware_features = depth_camera_depth_aware_features.view(-1, self.hidden_channels, 1, 1)
+        # # (B*N, C, H, W)
+        depth_features = self.depth_se(depth_features, depth_camera_depth_aware_features)
+        if self.with_cp:
+            depth_features = checkpoint(self.depth_conv, depth_features)
+        else:
+            depth_features = self.depth_conv(depth_features)
+        return depth_features
+
+    def forward(self, x: torch.Tensor, camera_depth_aware_parameters: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: torch.Tensor, the input tensor of shape (B, N, C, H, W).
+            camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B, N, N_CAMERA_DEPTH_PARAMETERS).
+        Returns:
+            torch.Tensor, the output tensor of shape (B*N, C, H, W).
+        """
+        # (B, N, N_CAMERA_DEPTH_PARAMETERS) -> (B*N, N_CAMERA_DEPTH_PARAMETERS)
+        camera_depth_aware_parameters = camera_depth_aware_parameters.view(-1, self.num_camera_depth_parameters)
+
+        # (B*N, N_CAMERA_DEPTH_PARAMETERS)
+        camera_depth_aware_features = self.camera_depth_aware_parameters_bn(camera_depth_aware_parameters)
+        context_input_features = self.context_input_conv(x)
+        context_features = self.context_forward(context_input_features, camera_depth_aware_features)
+        depth_features = self.depth_forward(context_input_features, camera_depth_aware_features)
+        return torch.cat([depth_features, context_features], dim=1)
+
+
+class BaseViewTransformV2(BaseViewTransform):
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+        collapse_z: bool = True,
+        expand_batch_axis: bool = False,
+        visualize_bev_feat: bool = False,
+    ):
+        """
+        Args:
+            collapse_z: collapse the Z axis of the BEV grid
+            expand_batch_axis: expand the batch axis of the inputs to bev pool if this is set to True.
+        """
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            image_size=image_size,
+            feature_size=feature_size,
+            xbound=xbound,
+            ybound=ybound,
+            zbound=zbound,
+            dbound=dbound,
+            visualize_bev_feat=visualize_bev_feat,
+        )
+        self.collapse_z = collapse_z
+        self.expand_batch_axis = expand_batch_axis
+
+    def get_cam_feats(
+        self, x, camera_depth_aware_parameters: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
+
+    def forward(
+        self,
+        img,
+        points,
+        lidar2image,
+        camera_intrinsics,
+        camera2lidar,
+        img_aug_matrix,
+        lidar_aug_matrix,
+        metas,
+        camera_intrinsics_inverse,
+        img_aug_matrix_inverse,
+        lidar_aug_matrix_inverse,
+        geom_feats_precomputed,
+        camera_depth_aware_parameters: Optional[torch.Tensor] = None,
+    ):
+        if geom_feats_precomputed is not None:
+            ranks_bev, ranks_depth, ranks_feat = geom_feats_precomputed
+            x, depth_softmax = self.get_cam_feats(img)
+            x = self.bev_pool_precomputed(x, depth_softmax, ranks_bev, ranks_depth, ranks_feat)
+
+            # No return depth predictions when precomputed geometry features are used
+            depth_softmax = None
+
+        else:
+            intrins = camera_intrinsics[..., :3, :3]
+            post_rots = img_aug_matrix[..., :3, :3]
+            post_trans = img_aug_matrix[..., :3, 3]
+            camera2lidar_rots = camera2lidar[..., :3, :3]
+            camera2lidar_trans = camera2lidar[..., :3, 3]
+
+            extra_rots = lidar_aug_matrix[..., :3, :3]
+            extra_trans = lidar_aug_matrix[..., :3, 3]
+
+            geom = self.get_geometry(
+                camera2lidar_rots,
+                camera2lidar_trans,
+                torch.inverse(intrins),
+                torch.inverse(post_rots),
+                post_trans,
+                extra_rots=extra_rots,
+                extra_trans=extra_trans,
+            )
+
+            # depth is not connected to the calibration
+            # on_img is
+            # is also flattened_indices
+            (
+                view_feats,
+                depth_softmax,
+            ) = self.get_cam_feats(img, camera_depth_aware_parameters)
+            x = self.bev_pool(view_feats, depth_softmax, geom)
+
+        return x, depth_softmax
+
+    def bev_pool_aux(self, geom_feats):
+        B, N, D, H, W, C = geom_feats.shape
+        Nprime = B * N * D * H * W
+        assert C == 3
+
+        # record the index of selected points for acceleration purpose
+        ranks_depth = torch.arange(0, Nprime, dtype=torch.int, device=geom_feats.device)
+        ranks_feat = torch.arange(0, Nprime // D, dtype=torch.int, device=geom_feats.device)
+        ranks_feat = ranks_feat.reshape(B, N, 1, H, W)
+        ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten()
+
+        # flatten indices
+        geom_feats = ((geom_feats - (self.bx - self.dx / 2.0)) / self.dx).long()
+        geom_feats = geom_feats.view(Nprime, 3)
+        batch_ix = torch.cat(
+            [torch.full([Nprime // B, 1], ix, device=geom_feats.device, dtype=torch.long) for ix in range(B)]
+        )
+        geom_feats = torch.cat((geom_feats, batch_ix), 1)
+
+        # filter out points that are outside box
+        kept = (
+            (geom_feats[:, 0] >= 0)
+            & (geom_feats[:, 0] < self.nx[0])
+            & (geom_feats[:, 1] >= 0)
+            & (geom_feats[:, 1] < self.nx[1])
+            & (geom_feats[:, 2] >= 0)
+            & (geom_feats[:, 2] < self.nx[2])
+        )
+
+        if len(kept) == 0:
+            return None, None, None
+
+        geom_feats, ranks_depth, ranks_feat = geom_feats[kept], ranks_depth[kept], ranks_feat[kept]
+
+        # Switch x and y to match the order of the BEV grid
+        ranks_bev = (
+            geom_feats[:, 3] * (self.nx[2] * self.nx[1] * self.nx[0])
+            + geom_feats[:, 2] * (self.nx[1] * self.nx[0])
+            + geom_feats[:, 0] * self.nx[1]
+            + geom_feats[:, 1]
+        )
+        indices = ranks_bev.argsort()
+        ranks_bev, ranks_depth, ranks_feat = ranks_bev[indices], ranks_depth[indices], ranks_feat[indices]
+        return (
+            ranks_bev.int().contiguous(),
+            ranks_depth.int().contiguous(),
+            ranks_feat.int().contiguous(),
+        )
+
+    def compute_intervals(self, ranks_bev: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
+        if ranks_bev is None:
+            return None, None
+
+        kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
+        kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
+        interval_starts = torch.where(kept)[0].int()
+        if len(interval_starts) == 0:
+            return None, None
+
+        interval_lengths = torch.zeros_like(interval_starts)
+        interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
+        interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
+        return interval_starts.int().contiguous(), interval_lengths.int().contiguous()
+
+    def bev_pool(self, view_feats, depth_softmax, geom) -> torch.Tensor:
+        """ """
+        ranks_bev, ranks_depth, ranks_feat = self.bev_pool_aux(geom)
+        interval_starts, interval_lengths = self.compute_intervals(ranks_bev)
+        bev_feat = self.compute_bev_pool(
+            view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths
+        )
+        return bev_feat
+
+    def compute_bev_pool(
+        self, view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths
+    ):
+        """Compute the BEV pool for the given view features, depth softmax, ranks, and intervals."""
+        if interval_starts is None:
+            print_log("warning ---> no points within the predefined bev receptive field")
+            dummy = torch.zeros(
+                size=[view_feats.shape[0], view_feats.shape[2], self.nx[2], self.nx[1], self.nx[0]],
+                dtype=view_feats.dtype,
+                device=view_feats.device,
+            )
+            if self.collapse_z:
+                dummy = torch.cat(dummy.unbind(dim=2), 1)
+            return dummy
+
+        if self.expand_batch_axis:
+            view_feats = view_feats.unsqueeze(0)
+            depth_softmax = depth_softmax.unsqueeze(0)
+
+        # permute view_feats from (B, N, C, fH, fW) to (B, N, fH, fW, C)
+        view_feats = view_feats.permute(0, 1, 3, 4, 2)
+        bev_feat_shape = (
+            depth_softmax.shape[0],
+            int(self.nx[2]),
+            int(self.nx[1]),
+            int(self.nx[0]),
+            view_feats.shape[-1],
+        )  # (B, Z, Y, X, C)
+        bev_feat = bev_pool_v2(
+            depth=depth_softmax,
+            feat=view_feats,
+            ranks_depth=ranks_depth,
+            ranks_feat=ranks_feat,
+            ranks_bev=ranks_bev,
+            interval_starts=interval_starts,
+            interval_lengths=interval_lengths,
+            bev_feat_shape=bev_feat_shape,
+            is_training=self.training,
+        )
+
+        # collapse Z
+        if self.collapse_z:
+            bev_feat = torch.cat(bev_feat.unbind(dim=2), 1)
+
+        if self.visualize_bev_feat:
+            self.plot_bev_feat(bev_feat)
+
+        return bev_feat
+
+    def bev_pool_precomputed(self, view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat):
+        interval_starts, interval_lengths = self.compute_intervals(ranks_bev)
+        bev_feat = self.compute_bev_pool(
+            view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths
+        )
+        return bev_feat
+
+    def get_depth_softmax(self, x: torch.Tensor, B, N, fH, fW) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            x: torch.Tensor, the input tensor of shape (B*N, D+C, H, W).
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor], the tuple containing the view features and depth softmax.
+            view_feats: torch.Tensor, the view features of shape (B, N, C, H, W).
+            depth_softmax: torch.Tensor, the depth softmax of shape (B, N, D, H, W).
+        """
+        depth_softmax = x[:, : self.D].softmax(dim=1)
+        depth_softmax = depth_softmax.view(B, N, self.D, fH, fW)
+        view_feats = x[:, self.D : (self.D + self.C)]
+        view_feats = view_feats.view(B, N, self.C, fH, fW)
+        return view_feats, depth_softmax
+
+
+@MODELS.register_module()
+class LSSTransformV2(BaseViewTransformV2):
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+        downsample: int = 1,
+    ):
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            image_size=image_size,
+            feature_size=feature_size,
+            xbound=xbound,
+            ybound=ybound,
+            zbound=zbound,
+            dbound=dbound,
+        )
+        self.depthnet = nn.Conv2d(self.in_channels, self.D + self.C, 1)
+        self.downsample = DownSampleNet(downsample, out_channels, out_channels)
+
+    def get_cam_feats(
+        self, x: torch.Tensor, camera_depth_aware_parameters: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        B, N, C, fH, fW = x.shape
+        x = x.view(B * N, C, fH, fW)
+        x = self.depthnet(x)
+        return self.get_depth_softmax(x, B=B, N=N, fH=fH, fW=fW)
+
+    def forward(self, *args, **kwargs):
+        x, depth_softmax = super().forward(*args, **kwargs)
+        x = self.downsample(x)
+        return x, depth_softmax
+
+
+@MODELS.register_module()
+class LSSTransformV2DepthAware(BaseViewTransformV2):
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+        camera_depth_aware_configs: dict,
+        downsample: int = 1,
+    ):
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            image_size=image_size,
+            feature_size=feature_size,
+            xbound=xbound,
+            ybound=ybound,
+            zbound=zbound,
+            dbound=dbound,
+        )
+        self.downsample = DownSampleNet(downsample, out_channels, out_channels)
+        self.camera_depth_aware_net = CameraDepthAwareNet(
+            in_channels=in_channels,
+            hidden_channels=in_channels,
+            mlp_drop_out=camera_depth_aware_configs["mlp_drop_out"],
+            depth_channels=self.D,
+            out_channels=self.C,
+        )
+
+    def get_cam_feats(
+        self, x: torch.Tensor, camera_depth_aware_parameters: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        B, N, C, fH, fW = x.shape
+        x = x.view(B * N, C, fH, fW)
+        x = self.camera_depth_aware_net(x, camera_depth_aware_parameters)
+        return self.get_depth_softmax(x, B=B, N=N, fH=fH, fW=fW)
+
+    def forward(self, *args, **kwargs):
+        x, depth_softmax = super().forward(*args, **kwargs)
+        x = self.downsample(x)
+        return x, depth_softmax
diff --git a/projects/BEVFusion/bevfusion/loading.py b/projects/BEVFusion/bevfusion/loading.py
index 0478d67a3..7f3322fda 100644
--- a/projects/BEVFusion/bevfusion/loading.py
+++ b/projects/BEVFusion/bevfusion/loading.py
@@ -1,10 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import copy
-import os
-from typing import List, Optional
+from pathlib import Path
+from typing import List, Optional, Tuple
 
+import matplotlib.pyplot as plt
 import mmcv
 import numpy as np
+from mmcv.transforms import BaseTransform
 from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles
 from mmdet3d.registry import TRANSFORMS
 from mmengine.fileio import get
@@ -217,3 +219,219 @@ def transform(self, results: dict) -> Optional[dict]:
         results["num_views"] = self.num_views
         results["num_ref_frames"] = self.num_ref_frames
         return results
+
+
+@TRANSFORMS.register_module()
+class PointsToMultiViewImageDepths(BaseTransform):
+    """Convert points to multi-view image depths.
+
+    Args:
+        points (np.ndarray): Points in the world coordinate system.
+        img_shape (tuple): Shape of the image.
+        cam2img (np.ndarray): Camera to image transformation matrix.
+        lidar2cam (np.ndarray): LiDAR to camera transformation matrix.
+        visualize_dir (str, optional): If set, saves a per-sample subplot
+            of `gt_depths` (one panel per camera) to this directory.
+            Useful for debugging the projection. Defaults to None.
+        max_depth (float): Upper clip for the depth color scale (m).
+            Defaults to 80.
+    """
+
+    def __init__(
+        self,
+        img_shape,
+        num_cameras: int,
+        depth_bounds: Tuple[float, float],
+        visualize_dir: Optional[str] = None,
+        max_depth: float = 80.0,
+    ):
+        self.img_shape = img_shape
+        self.num_cameras = num_cameras
+        self.visualize_dir = visualize_dir
+        self.max_depth = max_depth
+        self.depth_bounds = depth_bounds
+        self.visualize_dir = Path(visualize_dir) if visualize_dir is not None else None
+        if self.visualize_dir is not None:
+            self.visualize_dir.mkdir(parents=True, exist_ok=True)
+        self._depth_idx = 0
+
+    def transform(self, results: dict) -> Optional[dict]:
+        """Call function to load multi-view image from files.
+
+        Args:
+            results (dict): Result dict containing multi-view image filenames.
+
+        Returns:
+            dict: The result dict containing the multi-view image data.
+            Added keys:
+                - gt_depths (np.ndarray): Ground truth depths in (N, H, W) for (number of cameras, height, width).
+        """
+        lidar2image = np.asarray(results["lidar2img"])
+        img_aug_matrix = np.asarray(results["img_aug_matrix"]) if "img_aug_matrix" in results else np.eye(4)
+        cur_coords = results["points"].numpy()[:, :3]
+
+        # inverse lidar aug
+        if "lidar_aug_matrix" in results:
+            lidar_aug_matrix = np.asarray(results["lidar_aug_matrix"])
+            lidar_aug_matrix_inverse = np.linalg.inv(lidar_aug_matrix)
+            cur_coords -= lidar_aug_matrix[:3, 3]
+            cur_coords = lidar_aug_matrix_inverse[:3, :3] @ cur_coords.transpose(1, 0)
+        else:
+            cur_coords = cur_coords.transpose(1, 0)
+
+        # lidar2image
+        cur_coords = lidar2image[:, :3, :3] @ cur_coords
+        cur_coords += lidar2image[:, :3, 3].reshape(-1, 3, 1)
+
+        # get 2d coords
+        dist = cur_coords[:, 2, :]
+        valid_dist_mask = (dist >= self.depth_bounds[0]) & (dist < self.depth_bounds[1])
+
+        cur_coords[:, 2, :] = np.clip(cur_coords[:, 2, :], 1e-5, 1e5)
+        cur_coords[:, :2, :] /= cur_coords[:, 2:3, :]
+
+        # imgaug
+        cur_coords = img_aug_matrix[:, :3, :3] @ cur_coords
+        cur_coords += img_aug_matrix[:, :3, 3].reshape(-1, 3, 1)
+        cur_coords = cur_coords[:, :2, :].transpose(0, 2, 1)
+
+        # normalize coords for grid sample
+        cur_coords = cur_coords[..., [1, 0]]
+        on_img = (
+            (cur_coords[..., 0] < self.img_shape[0])
+            & (cur_coords[..., 0] >= 0)
+            & (cur_coords[..., 1] < self.img_shape[1])
+            & (cur_coords[..., 1] >= 0)
+            & valid_dist_mask
+        )
+
+        # Avoid loops since it's slow
+        indices = np.nonzero(on_img)
+        camera_indices = indices[0]
+        point_indices = indices[1]
+        masked_coords = cur_coords[camera_indices, point_indices].astype(np.int64)
+        masked_dist = dist[camera_indices, point_indices]
+
+        # Possibly to have duplicates and the last one will be used, however, the chance is small
+        flatten_indices = (
+            camera_indices * self.img_shape[0] * self.img_shape[1]
+            + masked_coords[:, 0] * self.img_shape[1]
+            + masked_coords[:, 1]
+        )
+        depth_flat = np.zeros(self.num_cameras * self.img_shape[0] * self.img_shape[1], dtype=np.float32)
+        depth_flat[flatten_indices] = masked_dist
+        depth = depth_flat.reshape(self.num_cameras, self.img_shape[0], self.img_shape[1])
+        results["gt_depths"] = depth
+
+        if self.visualize_dir is not None:
+            self._save_depth_subplot(depth, results)
+        return results
+
+    def _save_depth_subplot(self, depth: np.ndarray, results: dict) -> None:
+        """Save `gt_depths` as a subplot with one panel per camera.
+
+        The figure contains three row blocks per camera:
+        - image underlay (if available) + projected LiDAR depth points
+        - image pixels only
+        - depth-only heatmap (no image pixel values)
+
+        Args:
+            depth (np.ndarray): (num_cameras, H, W) ground-truth depth map.
+            results (dict): The pipeline result dict; used for the underlay
+                image and to derive a unique filename.
+        """
+        imgs = results.get("img", None)
+
+        # Layout:
+        # - Top block: image underlay + projected depth points.
+        # - Middle block: image pixels only.
+        # - Bottom block: depth-only heatmap (no image pixel values).
+        if self.num_cameras <= 6:
+            base_rows, cols = 1, self.num_cameras
+        else:
+            cols = int(np.ceil(np.sqrt(self.num_cameras)))
+            base_rows = int(np.ceil(self.num_cameras / cols))
+        rows = base_rows * 3
+
+        fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 4 * rows), squeeze=False)
+
+        for c in range(self.num_cameras):
+            d = depth[c]
+            ys, xs = np.nonzero(d)
+            vals = d[ys, xs]
+
+            # Row block 1: image + depth scatter.
+            ax_overlay = axes[c // cols, c % cols]
+            if imgs is not None and c < len(imgs):
+                ax_overlay.imshow(imgs[c].astype(np.uint8))
+                if vals.size > 0:
+                    ax_overlay.scatter(
+                        xs,
+                        ys,
+                        c=vals,
+                        cmap="turbo",
+                        vmin=0,
+                        vmax=self.max_depth,
+                        s=1,
+                    )
+            else:
+                ax_overlay.imshow(
+                    d,
+                    cmap="turbo",
+                    vmin=0,
+                    vmax=self.max_depth,
+                    interpolation="nearest",
+                )
+            ax_overlay.set_title(f"cam {c} overlay  ({vals.size} pts)")
+            ax_overlay.set_xticks([])
+            ax_overlay.set_yticks([])
+
+            # Row block 2: image-only visualization.
+            ax_img = axes[base_rows + (c // cols), c % cols]
+            if imgs is not None and c < len(imgs):
+                ax_img.imshow(imgs[c].astype(np.uint8))
+            else:
+                ax_img.imshow(
+                    d,
+                    cmap="gray",
+                    vmin=0,
+                    vmax=self.max_depth,
+                    interpolation="nearest",
+                )
+            ax_img.set_title(f"cam {c} image-only")
+            ax_img.set_xticks([])
+            ax_img.set_yticks([])
+
+            # Row block 3: depth-only visualization.
+            ax_depth = axes[(base_rows * 2) + (c // cols), c % cols]
+            ax_depth.imshow(
+                d,
+                cmap="turbo",
+                vmin=0,
+                vmax=self.max_depth,
+                interpolation="nearest",
+            )
+            ax_depth.set_title(f"cam {c} depth-only")
+            ax_depth.set_xticks([])
+            ax_depth.set_yticks([])
+
+        # Hide any unused subplots when n doesn't fill the grid.
+        for c in range(self.num_cameras, base_rows * cols):
+            axes[c // cols, c % cols].axis("off")
+            axes[base_rows + (c // cols), c % cols].axis("off")
+            axes[(base_rows * 2) + (c // cols), c % cols].axis("off")
+
+        # Shared depth colorbar with numeric values.
+        depth_mappable = plt.cm.ScalarMappable(cmap="turbo", norm=plt.Normalize(vmin=0, vmax=self.max_depth))
+        depth_mappable.set_array([])
+        cbar = fig.colorbar(depth_mappable, ax=axes, location="right", fraction=0.02, pad=0.02)
+        cbar.set_label("Depth (m)")
+
+        fig.suptitle(f"gt_depths — {self._depth_idx}")
+        fig.tight_layout(rect=[0, 0, 0.96, 0.97])
+
+        self._depth_idx += 1
+        out_path = self.visualize_dir / f"{self._depth_idx:06d}_gt_depths.png"
+        fig.savefig(out_path, dpi=120, bbox_inches="tight")
+        plt.close(fig)
+        print(f"Saved gt_depths visualization to {out_path}")
diff --git a/projects/BEVFusion/bevfusion/ops/__init__.py b/projects/BEVFusion/bevfusion/ops/__init__.py
index e08abbc6d..f74f0edbb 100644
--- a/projects/BEVFusion/bevfusion/ops/__init__.py
+++ b/projects/BEVFusion/bevfusion/ops/__init__.py
@@ -1,4 +1,12 @@
 from .bev_pool import bev_pool
+from .bev_pool_v2 import bev_pool_v2
 from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization
 
-__all__ = ["bev_pool", "Voxelization", "voxelization", "dynamic_scatter", "DynamicScatter"]
+__all__ = [
+    "bev_pool",
+    "bev_pool_v2",
+    "Voxelization",
+    "voxelization",
+    "dynamic_scatter",
+    "DynamicScatter",
+]
diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py
new file mode 100644
index 000000000..ff2fdfff7
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py
@@ -0,0 +1,3 @@
+from .bev_pool_v2 import bev_pool_v2
+
+__all__ = ["bev_pool_v2"]
diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py
new file mode 100644
index 000000000..af1ba15de
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py
@@ -0,0 +1,191 @@
+# Copyright (c) Phigent Robotics. All rights reserved.
+
+import numpy as np
+import torch
+
+from . import bev_pool_v2_ext
+
+
+class QuickCumsumV2TrainingCuda(torch.autograd.Function):
+    r"""BEVPoolv2 implementation for Lift-Splat-Shoot view transformation.
+
+    Please refer to the `paper <https://arxiv.org/abs/2211.17111>`_
+    """
+
+    @staticmethod
+    def forward(
+        ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths
+    ):
+        ranks_bev = ranks_bev.int()
+        depth = depth.contiguous().float()
+        feat = feat.contiguous().float()
+        ranks_depth = ranks_depth.contiguous().int()
+        ranks_feat = ranks_feat.contiguous().int()
+        interval_lengths = interval_lengths.contiguous().int()
+        interval_starts = interval_starts.contiguous().int()
+
+        out = feat.new_zeros(bev_feat_shape)
+
+        bev_pool_v2_ext.bev_pool_v2_forward(
+            depth,
+            feat,
+            out,
+            ranks_depth,
+            ranks_feat,
+            ranks_bev,
+            interval_lengths,
+            interval_starts,
+        )
+
+        ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth)
+        return out
+
+    @staticmethod
+    def backward(ctx, out_grad):
+        ranks_bev, depth, feat, ranks_feat, ranks_depth = ctx.saved_tensors
+
+        order = ranks_feat.argsort()
+        ranks_feat, ranks_depth, ranks_bev = ranks_feat[order], ranks_depth[order], ranks_bev[order]
+        kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
+        kept[1:] = ranks_feat[1:] != ranks_feat[:-1]
+        interval_starts_bp = torch.where(kept)[0].int()
+        interval_lengths_bp = torch.zeros_like(interval_starts_bp)
+        interval_lengths_bp[:-1] = interval_starts_bp[1:] - interval_starts_bp[:-1]
+        interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1]
+
+        depth = depth.contiguous()
+        feat = feat.contiguous()
+        ranks_depth = ranks_depth.contiguous()
+        ranks_feat = ranks_feat.contiguous()
+        ranks_bev = ranks_bev.contiguous()
+        interval_lengths_bp = interval_lengths_bp.contiguous()
+        interval_starts_bp = interval_starts_bp.contiguous()
+
+        depth_grad = depth.new_zeros(depth.shape)
+        feat_grad = feat.new_zeros(feat.shape)
+        out_grad = out_grad.contiguous()
+        bev_pool_v2_ext.bev_pool_v2_backward(
+            out_grad,
+            depth_grad,
+            feat_grad,
+            depth,
+            feat,
+            ranks_depth,
+            ranks_feat,
+            ranks_bev,
+            interval_lengths_bp,
+            interval_starts_bp,
+        )
+        return depth_grad, feat_grad, None, None, None, None, None, None, None, None
+
+
+class QuickCumsumV2Cuda(torch.autograd.Function):
+
+    @staticmethod
+    def symbolic(
+        g,
+        depth,
+        feat,
+        ranks_depth,
+        ranks_feat,
+        ranks_bev,
+        interval_starts,
+        interval_lengths,
+        out_height=128,
+        out_width=128,
+    ):
+        """symbolic function for creating onnx op."""
+        x = g.op(
+            "autoware::QuickCumsumV2Cuda",
+            depth,
+            feat,
+            ranks_depth,
+            ranks_feat,
+            ranks_bev,
+            interval_starts,
+            interval_lengths,
+            out_height_i=out_height,
+            out_width_i=out_width,
+        )
+
+        # features_shape = _get_tensor_sizes(feat)
+        # if features_shape is not None and hasattr(x.type(), "with_sizes"):
+        #     output_type = x.type().with_sizes([B, D, H, W, _get_tensor_dim_size(x, -1)])
+        #     output.setType(output_type)
+
+    @staticmethod
+    def forward(
+        ctx,
+        depth,  # B,N,D,H,W
+        feat,  # B,N,H,W,C
+        ranks_depth,
+        ranks_feat,
+        ranks_bev,
+        interval_starts,
+        interval_lengths,
+        out_height=128,
+        out_width=128,
+    ):
+        """run forward."""
+        out = feat.new_zeros(depth.shape[0], 1, out_height, out_width, feat.shape[-1])
+        bev_feat = bev_pool_v2_ext.bev_pool_v2_forward(
+            depth,
+            feat,
+            out,
+            ranks_depth,
+            ranks_feat,
+            ranks_bev,
+            interval_lengths,
+            interval_starts,
+        )
+        return bev_feat
+
+    @staticmethod
+    def backward(ctx, out_grad):
+        raise NotImplementedError
+
+
+def bev_pool_v2(
+    depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, bev_feat_shape, is_training
+):
+    # Always use full (B, Z, H, W, C) buffer; QuickCumsumV2Cuda (Z=1) is ONNX-only.
+    del is_training
+    x = QuickCumsumV2TrainingCuda.apply(
+        depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths
+    )
+
+    # Final shape: (B, C, Z, H, W) — matches LSSTransform v1 after permute
+    x = x.permute(0, 4, 1, 2, 3).contiguous()
+    return x
+
+
+def test_bev_pool_v2():
+    depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9])
+    depth = torch.from_numpy(depth).float().cuda()
+    depth = depth.view(1, 1, 2, 2, 2).requires_grad_()
+    feat = torch.ones(size=[1, 1, 2, 2, 2], dtype=torch.float, device="cuda").requires_grad_()
+    ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda()
+    ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda()
+    ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda()
+
+    kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
+    kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
+    interval_starts = torch.where(kept)[0].int()
+    if len(interval_starts) == 0:
+        return None, None, None, None, None
+    interval_lengths = torch.zeros_like(interval_starts)
+    interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
+    interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
+    bev_feat = bev_pool_v2(
+        depth, feat, ranks_depth, ranks_feat, ranks_bev, (1, 1, 2, 2, 2), interval_starts, interval_lengths
+    )
+    loss = torch.sum(bev_feat)
+    loss.backward()
+    assert loss == 4.4
+    grad_depth = np.array([2.0, 2.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0])
+    grad_depth = torch.from_numpy(grad_depth).float()
+    grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2)
+    assert depth.grad.allclose(grad_depth)
+    grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0.0, 0.0])
+    grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2)
+    assert feat.grad.allclose(grad_feat)
diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp
new file mode 100644
index 000000000..c7c38f695
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp
@@ -0,0 +1,111 @@
+// Copyright (c) Phigent Robotics. All rights reserved.
+// Reference https://arxiv.org/abs/2211.17111
+#include <torch/torch.h>
+#include <c10/cuda/CUDAGuard.h>
+
+// CUDA function declarations
+void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat,
+    const int* ranks_depth, const int* ranks_feat, const int* ranks_bev,
+    const int* interval_starts, const int* interval_lengths, float* out);
+
+void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad,
+  const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat,
+  const int* ranks_bev, const int* interval_starts, const int* interval_lengths,
+  float* depth_grad, float* feat_grad);
+
+
+/*
+  Function: pillar pooling (forward, cuda)
+  Args:
+    depth            : input depth, FloatTensor[n, d, h, w]
+    feat             : input features, FloatTensor[n, h, w, c]
+    out              : output features, FloatTensor[b, c, h_out, w_out]
+    ranks_depth      : depth index of points, IntTensor[n_points]
+    ranks_feat       : feat index of points, IntTensor[n_points]
+    ranks_bev        : output index of points, IntTensor[n_points]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+  Return:
+*/
+void bev_pool_v2_forward(
+  const at::Tensor _depth,
+  const at::Tensor _feat,
+  at::Tensor _out,
+  const at::Tensor _ranks_depth,
+  const at::Tensor _ranks_feat,
+  const at::Tensor _ranks_bev,
+  const at::Tensor _interval_lengths,
+  const at::Tensor _interval_starts
+) {
+  int c = _feat.size(4);
+  int n_intervals = _interval_lengths.size(0);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_depth));
+  const float* depth = _depth.data_ptr<float>();
+  const float* feat = _feat.data_ptr<float>();
+  const int* ranks_depth = _ranks_depth.data_ptr<int>();
+  const int* ranks_feat = _ranks_feat.data_ptr<int>();
+  const int* ranks_bev = _ranks_bev.data_ptr<int>();
+
+  const int* interval_lengths = _interval_lengths.data_ptr<int>();
+  const int* interval_starts = _interval_starts.data_ptr<int>();
+
+  float* out = _out.data_ptr<float>();
+  bev_pool_v2(
+    c, n_intervals, depth, feat, ranks_depth, ranks_feat,
+    ranks_bev, interval_starts, interval_lengths, out
+  );
+}
+
+
+/*
+  Function: pillar pooling (backward, cuda)
+  Args:
+    out_grad         : grad of output bev feature, FloatTensor[b, c, h_out, w_out]
+    depth_grad       : grad of input depth, FloatTensor[n, d, h, w]
+    feat_grad        : grad of input feature, FloatTensor[n, h, w, c]
+    depth            : input depth, FloatTensor[n, d, h, w]
+    feat             : input features, FloatTensor[n, h, w, c]
+    ranks_depth      : depth index of points, IntTensor[n_points]
+    ranks_feat       : feat index of points, IntTensor[n_points]
+    ranks_bev        : output index of points, IntTensor[n_points]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+*/
+void bev_pool_v2_backward(
+  const at::Tensor _out_grad,
+  at::Tensor _depth_grad,
+  at::Tensor _feat_grad,
+  const at::Tensor _depth,
+  const at::Tensor _feat,
+  const at::Tensor _ranks_depth,
+  const at::Tensor _ranks_feat,
+  const at::Tensor _ranks_bev,
+  const at::Tensor _interval_lengths,
+  const at::Tensor _interval_starts
+) {
+  int c = _out_grad.size(4);
+  int n_intervals = _interval_lengths.size(0);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad));
+  const float* out_grad = _out_grad.data_ptr<float>();
+  float* depth_grad = _depth_grad.data_ptr<float>();
+  float* feat_grad = _feat_grad.data_ptr<float>();
+  const float* depth = _depth.data_ptr<float>();
+  const float* feat = _feat.data_ptr<float>();
+  const int* ranks_depth = _ranks_depth.data_ptr<int>();
+  const int* ranks_feat = _ranks_feat.data_ptr<int>();
+  const int* ranks_bev = _ranks_bev.data_ptr<int>();
+  const int* interval_lengths = _interval_lengths.data_ptr<int>();
+  const int* interval_starts = _interval_starts.data_ptr<int>();
+
+  bev_pool_v2_grad(
+    c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat,
+    ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad
+  );
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("bev_pool_v2_forward", &bev_pool_v2_forward,
+        "bev_pool_v2_forward");
+  m.def("bev_pool_v2_backward", &bev_pool_v2_backward,
+        "bev_pool_v2_backward");
+}
diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu
new file mode 100644
index 000000000..7fa3179b7
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu
@@ -0,0 +1,140 @@
+// Copyright (c) Phigent Robotics. All rights reserved.
+// Reference https://arxiv.org/abs/2211.17111
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+  Function: pillar pooling
+  Args:
+    c                : number of channels
+    n_intervals      : number of unique points
+    depth            : input depth, FloatTensor[b,n,d,h,w]
+    feat             : input feat, FloatTensor[b,n,h,w,c]
+    ranks_depth      : input index of depth, IntTensor[n]
+    ranks_feat       : input index of feat, IntTensor[n]
+    ranks_bev        : output index, IntTensor[n]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+    out              : output features, FloatTensor[b, d, h, w, c]
+*/
+__global__ void bev_pool_v2_kernel(int c, int n_intervals,
+                                  const float *__restrict__ depth,
+                                  const float *__restrict__ feat,
+                                  const int *__restrict__ ranks_depth,
+                                  const int *__restrict__ ranks_feat,
+                                  const int *__restrict__ ranks_bev,
+                                  const int *__restrict__ interval_starts,
+                                  const int *__restrict__ interval_lengths,
+                                  float* __restrict__ out) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = idx / c;
+  int cur_c = idx % c;
+  if (index >= n_intervals) return;
+  int interval_start = interval_starts[index];
+  int interval_length = interval_lengths[index];
+  float psum = 0;
+  const float* cur_depth;
+  const float* cur_feat;
+  for(int i = 0; i < interval_length; i++){
+    cur_depth = depth + ranks_depth[interval_start+i];
+    cur_feat = feat + ranks_feat[interval_start+i] * c + cur_c;
+    psum += *cur_feat * *cur_depth;
+  }
+
+  const int* cur_rank = ranks_bev + interval_start;
+  float* cur_out = out + *cur_rank * c + cur_c;
+  *cur_out = psum;
+}
+
+
+/*
+  Function: pillar pooling backward
+  Args:
+    c                : number of channels
+    n_intervals      : number of unique points
+    out_grad         : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c]
+    depth            : input depth, FloatTensor[b,n,d,h,w]
+    feat             : input feat, FloatTensor[b,n,h,w,c]
+    ranks_depth      : input index of depth, IntTensor[n]
+    ranks_feat       : input index of feat, IntTensor[n]
+    ranks_bev        : output index, IntTensor[n]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+    depth_grad       : gradient of the depth fmap, FloatTensor
+    feat_grad        : gradient of the feature fmap, FloatTensor
+*/
+__global__ void bev_pool_grad_kernel(int c, int n_intervals,
+                                  const float *__restrict__ out_grad,
+                                  const float *__restrict__ depth,
+                                  const float *__restrict__ feat,
+                                  const int *__restrict__ ranks_depth,
+                                  const int *__restrict__ ranks_feat,
+                                  const int *__restrict__ ranks_bev,
+                                  const int *__restrict__ interval_starts,
+                                  const int *__restrict__ interval_lengths,
+                                  float* __restrict__ depth_grad,
+                                  float* __restrict__ feat_grad) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n_intervals) return;
+  int interval_start = interval_starts[idx];
+  int interval_length = interval_lengths[idx];
+
+  const int* cur_rank;
+  const float* cur_out_grad;
+  const float* cur_out_grad_start;
+
+  const float* cur_feat;
+  const float* cur_feat_start;
+  float* cur_depth_grad;
+  float grad_sum;
+  for(int i = 0; i < interval_length; i++){
+    cur_rank = ranks_bev + interval_start + i;
+    cur_out_grad_start = out_grad +  * cur_rank * c;
+    cur_feat_start = feat + ranks_feat[interval_start+i] * c;
+
+    grad_sum = 0;
+    for(int cur_c = 0; cur_c < c; cur_c++){
+      cur_out_grad = cur_out_grad_start + cur_c;
+      cur_feat = cur_feat_start + cur_c;
+      grad_sum += *cur_out_grad * *cur_feat;
+    }
+
+    cur_depth_grad = depth_grad + ranks_depth[interval_start+i];
+    *cur_depth_grad = grad_sum;
+  }
+
+  float* cur_feat_grad;
+  const float* cur_depth;
+  for(int cur_c = 0; cur_c < c; cur_c++){
+    grad_sum = 0;
+    for(int i = 0; i < interval_length; i++){
+      cur_rank = ranks_bev + interval_start + i;
+      cur_out_grad = out_grad + *cur_rank * c + cur_c;
+
+      cur_depth = depth + ranks_depth[interval_start+i];
+      grad_sum += *cur_out_grad * *cur_depth;
+    }
+    cur_feat_grad = feat_grad + ranks_feat[interval_start] * c + cur_c ;
+    * cur_feat_grad = grad_sum;
+  }
+}
+
+
+
+void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, const int* ranks_depth,
+  const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* out) {
+  bev_pool_v2_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
+    c, n_intervals, depth, feat, ranks_depth, ranks_feat,
+    ranks_bev, interval_starts, interval_lengths, out
+  );
+}
+
+void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad,
+  const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat,
+  const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* depth_grad, float* feat_grad) {
+  bev_pool_grad_kernel<<<(int)ceil(((double)n_intervals / 256)), 256>>>(
+     c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat,
+     ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad
+  );
+}
diff --git a/projects/BEVFusion/bevfusion/ops/topk/__init__.py b/projects/BEVFusion/bevfusion/ops/topk/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/projects/BEVFusion/bevfusion/ops/topk/topk.py b/projects/BEVFusion/bevfusion/ops/topk/topk.py
new file mode 100644
index 000000000..f0f9a8779
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/topk/topk.py
@@ -0,0 +1,46 @@
+"""
+This file is used to write functions to deploy custom plugins to support Autoware, for example, TopK.
+"""
+
+import torch
+from torch.autograd import Function
+from torch.onnx.symbolic_helper import _get_tensor_sizes
+
+
+class TopK(Function):
+
+    @staticmethod
+    def symbolic(
+        g,
+        x: torch.Tensor,
+        k: int,
+        dim: int,
+        sorted: bool = False,
+    ):
+
+        output = g.op(
+            "autoware::Argsort",
+            x,
+            outputs=1,
+        )
+        x_shape = _get_tensor_sizes(x)
+        if x_shape is not None and hasattr(output.type(), "with_sizes"):
+            output_type = x.type().with_sizes(x_shape)
+            output.setType(output_type)
+        # Argsort from Autoware is in ascending order, so we need to return the last k elements.
+        return output[-k:]
+
+    @staticmethod
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        k: int,
+        dim: int,
+        sorted: bool = False,
+    ):
+        _, indices = torch.topk(x, k=k, dim=dim, largest=True, sorted=sorted)
+        return indices
+
+
+def topk(x: torch.Tensor, k: int, dim: int, sorted: bool = False):
+    return TopK.apply(x, k, dim, sorted)
diff --git a/projects/BEVFusion/bevfusion/sparse_encoder.py b/projects/BEVFusion/bevfusion/sparse_encoder.py
index 019cb630c..6bf0592b6 100644
--- a/projects/BEVFusion/bevfusion/sparse_encoder.py
+++ b/projects/BEVFusion/bevfusion/sparse_encoder.py
@@ -1,4 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+
+import os
+from typing import Dict, Optional
+
+import numpy as np
+import torch
 from mmdet3d.models.layers import make_sparse_convmodule
 from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
 from mmdet3d.models.middle_encoders import SparseEncoder
@@ -9,8 +15,7 @@
 else:
     from mmcv.ops import SparseConvTensor
 
-import numpy as np
-import torch
+from .custom_sparse_conv_tensor import sparse_to_dense
 
 
 @MODELS.register_module()
@@ -24,6 +29,7 @@ class BEVFusionSparseEncoder(SparseEncoder):
     Args:
         in_channels (int): The number of input channels.
         sparse_shape (list[int]): The sparse shape of input tensor.
+        dense_output_shape (list[int]): The final shape of the dense output tensor.
         order (list[str], optional): Order of conv module.
             Defaults to ('conv', 'norm', 'act').
         norm_cfg (dict, optional): Config of normalization layer. Defaults to
@@ -47,10 +53,8 @@ class BEVFusionSparseEncoder(SparseEncoder):
     def __init__(
         self,
         in_channels,
-        aug_features_min_values,
-        aug_features_max_values,
-        num_aug_features,
         sparse_shape,
+        dense_output_shapes,
         order=("conv", "norm", "act"),
         norm_cfg=dict(type="BN1d", eps=1e-3, momentum=0.01),
         base_channels=16,
@@ -63,10 +67,8 @@ def __init__(
         super(SparseEncoder, self).__init__()
         assert block_type in ["conv_module", "basicblock"]
         self.sparse_shape = sparse_shape
+        self.dense_output_shapes = dense_output_shapes
         self.in_channels = in_channels
-        self.register_buffer("aug_features_min_values", torch.tensor(aug_features_min_values))
-        self.register_buffer("aug_features_max_values", torch.tensor(aug_features_max_values))
-        self.num_aug_features = num_aug_features
         self.order = order
         self.base_channels = base_channels
         self.output_channels = output_channels
@@ -77,10 +79,6 @@ def __init__(
         self.return_middle_feats = return_middle_feats
         # Spconv init all weight on its own
 
-        if num_aug_features:
-            self.in_channels = in_channels * num_aug_features * 2
-            self.register_buffer("exponents", (2 ** torch.arange(0, num_aug_features).float()))
-
         assert isinstance(order, tuple) and len(order) == 3
         assert set(order) == {"conv", "norm", "act"}
 
@@ -140,16 +138,6 @@ def forward(self, voxel_features, coors, batch_size):
                 output features. When self.return_middle_feats is True, the
                 module returns middle features.
         """
-
-        if self.num_aug_features:
-            num_points = voxel_features.shape[0]
-            x = (voxel_features - self.aug_features_min_values.view(1, -1)) / (
-                self.aug_features_max_values - self.aug_features_min_values
-            ).view(1, -1)
-            y = x.reshape(-1, 1) * np.pi * self.exponents.reshape(1, -1)
-            y = y.reshape(num_points, -1)
-            voxel_features = torch.cat([torch.cos(y), torch.sin(y)], dim=1)
-
         coors = coors.int()
         input_sp_tensor = SparseConvTensor(voxel_features, coors, self.sparse_shape, batch_size)
         x = self.conv_input(input_sp_tensor)
@@ -162,11 +150,16 @@ def forward(self, voxel_features, coors, batch_size):
         # for detection head
         # [200, 176, 5] -> [200, 176, 2]
         out = self.conv_out(encode_features[-1])
-        spatial_features = out.dense()
 
-        N, C, H, W, D = spatial_features.shape
-        spatial_features = spatial_features.permute(0, 1, 4, 2, 3).contiguous()
-        spatial_features = spatial_features.view(N, C * D, H, W)
+        spatial_features = sparse_to_dense(out, batch_size, self.dense_output_shapes, self.output_channels)
+        # spatial_features = out.dense(channels_first=False)
+        spatial_features = spatial_features.permute(0, 4, 3, 1, 2).contiguous()
+        spatial_features = spatial_features.view(
+            batch_size,
+            self.output_channels * self.dense_output_shapes[2],
+            self.dense_output_shapes[0],
+            self.dense_output_shapes[1],
+        )
 
         if self.return_middle_feats:
             return spatial_features, encode_features
diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py
index 7e9faca24..31d0cc417 100644
--- a/projects/BEVFusion/bevfusion/transforms_3d.py
+++ b/projects/BEVFusion/bevfusion/transforms_3d.py
@@ -188,6 +188,19 @@ def transform(self, input_dict: dict) -> dict:
         return input_dict
 
 
+@TRANSFORMS.register_module()
+class BEVFusionRemoveLiDARPoints(BaseTransform):
+    """Remove LiDAR points from the data."""
+
+    def __init__(self):
+        super().__init__()
+
+    def transform(self, results: Dict[str, Any]) -> Dict[str, Any]:
+        if "points" in results:
+            results["points"] = None
+        return results
+
+
 @TRANSFORMS.register_module()
 class GridMask(BaseTransform):
 
diff --git a/projects/BEVFusion/bevfusion/utils.py b/projects/BEVFusion/bevfusion/utils.py
index c47604dbd..39c6a0ded 100644
--- a/projects/BEVFusion/bevfusion/utils.py
+++ b/projects/BEVFusion/bevfusion/utils.py
@@ -85,26 +85,31 @@ def decode(self, heatmap, rot, dim, center, height, vel, filter=False):
             final_box_preds = torch.cat([center, height, dim, rot, vel], dim=1).permute(0, 2, 1)
 
         predictions_dicts = []
-        for i in range(heatmap.shape[0]):
-            boxes3d = final_box_preds[i]
-            scores = final_scores[i]
-            labels = final_preds[i]
-            predictions_dict = {"bboxes": boxes3d, "scores": scores, "labels": labels}
-            predictions_dicts.append(predictions_dict)
-
-        if filter is False:
+        if not filter:
+            for i in range(heatmap.shape[0]):
+                boxes3d = final_box_preds[i]
+                scores = final_scores[i]
+                labels = final_preds[i]
+                predictions_dict = {"bboxes": boxes3d, "scores": scores, "labels": labels}
+                predictions_dicts.append(predictions_dict)
             return predictions_dicts
 
         # use score threshold
         if self.score_threshold is not None:
-            thresh_mask = final_scores > self.score_threshold
+            if isinstance(self.score_threshold, float):
+                thresh_mask = final_scores > self.score_threshold
+            elif isinstance(self.score_threshold, (list, tuple)):
+                score_threshold = final_scores.new_tensor(self.score_threshold)
+                thresh_mask = final_scores > score_threshold[final_preds]
+            else:
+                raise ValueError("score_threshold must be a float or list")
 
+        predictions_dicts = []
         if self.post_center_range is not None:
             self.post_center_range = torch.tensor(self.post_center_range, device=heatmap.device)
             mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(2)
             mask &= (final_box_preds[..., :3] <= self.post_center_range[3:]).all(2)
 
-            predictions_dicts = []
             for i in range(heatmap.shape[0]):
                 cmask = mask[i, :]
                 if self.score_threshold:
@@ -114,7 +119,6 @@ def decode(self, heatmap, rot, dim, center, height, vel, filter=False):
                 scores = final_scores[i, cmask]
                 labels = final_preds[i, cmask]
                 predictions_dict = {"bboxes": boxes3d, "scores": scores, "labels": labels}
-
                 predictions_dicts.append(predictions_dict)
         else:
             raise NotImplementedError(
diff --git a/projects/BEVFusion/configs/deploy/bevfusion_main_body_lidar_only_intensity_tensorrt_dynamic.py b/projects/BEVFusion/configs/deploy/bevfusion_main_body_lidar_only_intensity_tensorrt_dynamic.py
index e22e0f41b..2652b3965 100644
--- a/projects/BEVFusion/configs/deploy/bevfusion_main_body_lidar_only_intensity_tensorrt_dynamic.py
+++ b/projects/BEVFusion/configs/deploy/bevfusion_main_body_lidar_only_intensity_tensorrt_dynamic.py
@@ -1,11 +1,7 @@
 codebase_config = dict(type="mmdet3d", task="VoxelDetection", model_type="end2end")
 
 custom_imports = dict(
-    imports=[
-        "projects.BEVFusion.deploy",
-        "projects.BEVFusion.bevfusion",
-        "projects.SparseConvolution",
-    ],
+    imports=["projects.BEVFusion.deploy", "projects.BEVFusion.bevfusion", "projects.SparseConvolution"],
     allow_failed_imports=False,
 )
 
@@ -29,7 +25,7 @@
     type="onnx",
     export_params=True,
     keep_initializers_as_inputs=False,
-    opset_version=17,
+    opset_version=18,
     save_file="bevfusion_lidar_intensity.onnx",
     input_names=["voxels", "coors", "num_points_per_voxel"],
     output_names=["bbox_pred", "score", "label_pred"],
@@ -45,5 +41,5 @@
         },
     },
     input_shape=None,
-    verbose=True,
+    verbose=False,
 )
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py
similarity index 75%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py
index e65c52ece..987c13393 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py
@@ -1,9 +1,8 @@
 _base_ = [
     "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py",
-    "../default/pipelines/default_camera_lidar_intensity_120m.py",
-    "../default/models/default_camera_swin_fpn_120m.py",
-    "../default/schedulers/default_30e_8xb8_adamw_linear_cosine.py",
+    "../default/pipelines/cameras/default_camera_120m.py",
+    "../default/schedulers/default_30e_8xb16_adamw_linear_cosine.py",
     "../default/default_misc.py",
 ]
 
@@ -13,35 +12,7 @@
 
 # user setting
 data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
-
-experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type
-experiment_name = "bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m"
-work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
-
-# model parameter
-model = dict(
-    type="BEVFusion",
-    view_transform=dict(image_size=_base_.image_size),
-    bbox_head=dict(
-        class_names=_base_.class_names,
-        in_channels=80,
-        train_cfg=dict(
-            point_cloud_range=_base_.point_cloud_range,
-            grid_size=_base_.grid_size,
-            voxel_size=_base_.voxel_size,
-        ),
-        test_cfg=dict(
-            grid_size=_base_.grid_size,
-            voxel_size=_base_.voxel_size[0:2],
-            pc_range=_base_.point_cloud_range[0:2],
-        ),
-        bbox_coder=dict(
-            pc_range=_base_.point_cloud_range[0:2],
-            voxel_size=_base_.voxel_size[0:2],
-        ),
-    ),
-)
+info_directory_path = "info/kokseang_2_8_0/"
 
 # Dataset parameters
 train_dataloader = dict(
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py
new file mode 100644
index 000000000..ffe9f1363
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py
@@ -0,0 +1,108 @@
+_base_ = [
+    "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
+    "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py",
+    "../default/pipelines/cameras/default_camera_50m.py",
+    "../default/schedulers/default_30e_8xb16_adamw_linear_cosine.py",
+    "../default/default_misc.py",
+]
+
+custom_imports = dict(imports=["projects.BEVFusion.bevfusion"], allow_failed_imports=False)
+custom_imports["imports"] += _base_.custom_imports["imports"]
+custom_imports["imports"] += ["autoware_ml.detection3d.datasets.transforms"]
+
+# user setting
+data_root = "data/t4dataset/"
+info_directory_path = "info/kokseang_2_8/"
+
+# Dataset parameters
+train_dataloader = dict(
+    batch_size=_base_.train_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=True),
+    dataset=dict(
+        type=_base_.dataset_type,
+        pipeline=_base_.train_pipeline,
+        modality=_base_.input_modality,
+        backend_args=_base_.backend_args,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_train_file_name,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        test_mode=False,
+        data_prefix=_base_.data_prefix,
+        box_type_3d="LiDAR",
+        filter_cfg=_base_.filter_cfg,
+    ),
+)
+
+val_dataloader = dict(
+    batch_size=_base_.test_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=False),
+    dataset=dict(
+        type=_base_.dataset_type,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_val_file_name,
+        pipeline=_base_.test_pipeline,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        modality=_base_.input_modality,
+        data_prefix=_base_.data_prefix,
+        test_mode=True,
+        box_type_3d="LiDAR",
+        backend_args=_base_.backend_args,
+    ),
+)
+
+test_dataloader = dict(
+    batch_size=_base_.test_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=False),
+    dataset=dict(
+        type=_base_.dataset_type,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_test_file_name,
+        pipeline=_base_.test_pipeline,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        modality=_base_.input_modality,
+        data_prefix=_base_.data_prefix,
+        test_mode=True,
+        box_type_3d="LiDAR",
+        backend_args=_base_.backend_args,
+    ),
+)
+
+val_evaluator = dict(
+    type="T4Metric",
+    data_root=data_root,
+    ann_file=data_root + info_directory_path + _base_.info_val_file_name,
+    metric="bbox",
+    backend_args=_base_.backend_args,
+    class_names=_base_.class_names,
+    name_mapping=_base_.name_mapping,
+    eval_class_range=_base_.eval_class_range,
+    filter_attributes=_base_.filter_attributes,
+)
+
+test_evaluator = dict(
+    type="T4Metric",
+    data_root=data_root,
+    ann_file=data_root + info_directory_path + _base_.info_test_file_name,
+    metric="bbox",
+    backend_args=_base_.backend_args,
+    class_names=_base_.class_names,
+    name_mapping=_base_.name_mapping,
+    eval_class_range=_base_.eval_class_range,
+    filter_attributes=_base_.filter_attributes,
+    save_csv=True,
+)
+
+default_hooks = dict(
+    logger=dict(type="LoggerHook", interval=50),
+    checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"),
+)
+log_processor = dict(window_size=50)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_120m.py
new file mode 100644
index 000000000..7a81be126
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_120m.py
@@ -0,0 +1,108 @@
+_base_ = [
+    "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
+    "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/base.py",
+    "../default/pipelines/cameras/default_camera_120m.py",
+    "../default/schedulers/default_50e_8xb16_adamw_linear_cosine.py",
+    "../default/default_misc.py",
+]
+
+custom_imports = dict(imports=["projects.BEVFusion.bevfusion"], allow_failed_imports=False)
+custom_imports["imports"] += _base_.custom_imports["imports"]
+custom_imports["imports"] += ["autoware_ml.detection3d.datasets.transforms"]
+
+# user setting
+data_root = "data/t4dataset/"
+info_directory_path = "info/kokseang_2_8_0/"
+
+# Dataset parameters
+train_dataloader = dict(
+    batch_size=_base_.train_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=True),
+    dataset=dict(
+        type=_base_.dataset_type,
+        pipeline=_base_.train_pipeline,
+        modality=_base_.input_modality,
+        backend_args=_base_.backend_args,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_train_file_name,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        test_mode=False,
+        data_prefix=_base_.data_prefix,
+        box_type_3d="LiDAR",
+        filter_cfg=_base_.filter_cfg,
+    ),
+)
+
+val_dataloader = dict(
+    batch_size=_base_.test_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=False),
+    dataset=dict(
+        type=_base_.dataset_type,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_val_file_name,
+        pipeline=_base_.test_pipeline,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        modality=_base_.input_modality,
+        data_prefix=_base_.data_prefix,
+        test_mode=True,
+        box_type_3d="LiDAR",
+        backend_args=_base_.backend_args,
+    ),
+)
+
+test_dataloader = dict(
+    batch_size=_base_.test_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=False),
+    dataset=dict(
+        type=_base_.dataset_type,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_test_file_name,
+        pipeline=_base_.test_pipeline,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        modality=_base_.input_modality,
+        data_prefix=_base_.data_prefix,
+        test_mode=True,
+        box_type_3d="LiDAR",
+        backend_args=_base_.backend_args,
+    ),
+)
+
+val_evaluator = dict(
+    type="T4Metric",
+    data_root=data_root,
+    ann_file=data_root + info_directory_path + _base_.info_val_file_name,
+    metric="bbox",
+    backend_args=_base_.backend_args,
+    class_names=_base_.class_names,
+    name_mapping=_base_.name_mapping,
+    eval_class_range=_base_.eval_class_range,
+    filter_attributes=_base_.filter_attributes,
+)
+
+test_evaluator = dict(
+    type="T4Metric",
+    data_root=data_root,
+    ann_file=data_root + info_directory_path + _base_.info_test_file_name,
+    metric="bbox",
+    backend_args=_base_.backend_args,
+    class_names=_base_.class_names,
+    name_mapping=_base_.name_mapping,
+    eval_class_range=_base_.eval_class_range,
+    filter_attributes=_base_.filter_attributes,
+    save_csv=True,
+)
+
+default_hooks = dict(
+    logger=dict(type="LoggerHook", interval=50),
+    checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"),
+)
+log_processor = dict(window_size=50)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_50m.py
new file mode 100644
index 000000000..4b79e2102
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_50m.py
@@ -0,0 +1,108 @@
+_base_ = [
+    "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
+    "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/base.py",
+    "../default/pipelines/cameras/default_camera_50m.py",
+    "../default/schedulers/default_50e_8xb16_adamw_linear_cosine.py",
+    "../default/default_misc.py",
+]
+
+custom_imports = dict(imports=["projects.BEVFusion.bevfusion"], allow_failed_imports=False)
+custom_imports["imports"] += _base_.custom_imports["imports"]
+custom_imports["imports"] += ["autoware_ml.detection3d.datasets.transforms"]
+
+# user setting
+data_root = "data/t4dataset/"
+info_directory_path = "info/kokseang_2_8_0/"
+
+# Dataset parameters
+train_dataloader = dict(
+    batch_size=_base_.train_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=True),
+    dataset=dict(
+        type=_base_.dataset_type,
+        pipeline=_base_.train_pipeline,
+        modality=_base_.input_modality,
+        backend_args=_base_.backend_args,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_train_file_name,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        test_mode=False,
+        data_prefix=_base_.data_prefix,
+        box_type_3d="LiDAR",
+        filter_cfg=_base_.filter_cfg,
+    ),
+)
+
+val_dataloader = dict(
+    batch_size=_base_.test_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=False),
+    dataset=dict(
+        type=_base_.dataset_type,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_val_file_name,
+        pipeline=_base_.test_pipeline,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        modality=_base_.input_modality,
+        data_prefix=_base_.data_prefix,
+        test_mode=True,
+        box_type_3d="LiDAR",
+        backend_args=_base_.backend_args,
+    ),
+)
+
+test_dataloader = dict(
+    batch_size=_base_.test_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=False),
+    dataset=dict(
+        type=_base_.dataset_type,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_test_file_name,
+        pipeline=_base_.test_pipeline,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        modality=_base_.input_modality,
+        data_prefix=_base_.data_prefix,
+        test_mode=True,
+        box_type_3d="LiDAR",
+        backend_args=_base_.backend_args,
+    ),
+)
+
+val_evaluator = dict(
+    type="T4Metric",
+    data_root=data_root,
+    ann_file=data_root + info_directory_path + _base_.info_val_file_name,
+    metric="bbox",
+    backend_args=_base_.backend_args,
+    class_names=_base_.class_names,
+    name_mapping=_base_.name_mapping,
+    eval_class_range=_base_.eval_class_range,
+    filter_attributes=_base_.filter_attributes,
+)
+
+test_evaluator = dict(
+    type="T4Metric",
+    data_root=data_root,
+    ann_file=data_root + info_directory_path + _base_.info_test_file_name,
+    metric="bbox",
+    backend_args=_base_.backend_args,
+    class_names=_base_.class_names,
+    name_mapping=_base_.name_mapping,
+    eval_class_range=_base_.eval_class_range,
+    filter_attributes=_base_.filter_attributes,
+    save_csv=True,
+)
+
+default_hooks = dict(
+    logger=dict(type="LoggerHook", interval=50),
+    checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"),
+)
+log_processor = dict(window_size=50)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_30e_8xb16_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_30e_8xb16_j6gen2_base_120m.py
new file mode 100644
index 000000000..92501d169
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_30e_8xb16_j6gen2_base_120m.py
@@ -0,0 +1,31 @@
+_base_ = [
+    "../default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py",
+    "../../default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py",
+]
+
+experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_resnet50_fpn_depthlss_30e_8xb16_j6gen2_base_120m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_50e_8xb16_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_50e_8xb16_base_120m.py
new file mode 100644
index 000000000..47c91cfb3
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_50e_8xb16_base_120m.py
@@ -0,0 +1,31 @@
+_base_ = [
+    "../default_bevfusion_camera_50e_8xb16_base_120m.py",
+    "../../default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py",
+]
+
+experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_resnet50_fpn_depthlss_50e_8xb16_base_120m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_30e_2xb16_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_30e_2xb16_j6gen2_base_50m.py
new file mode 100644
index 000000000..9074f14d2
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_30e_2xb16_j6gen2_base_50m.py
@@ -0,0 +1,31 @@
+_base_ = [
+    "../default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py",
+    "../../default/models/resnet50/camera_resnet50_fpn_lss_depth_50m.py",
+]
+
+experiment_group_name = "bevfusion_camera/j6gen2_base_depth_adjust_v2/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_resnet50_fpn_lss_depth_30e_8xb16_j6gen2_base_50m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_50e_8xb16_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_50e_8xb16_base_50m.py
new file mode 100644
index 000000000..40f008b34
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_50e_8xb16_base_50m.py
@@ -0,0 +1,31 @@
+_base_ = [
+    "../default_bevfusion_camera_50e_8xb16_base_50m.py",
+    "../../default/models/resnet50/camera_resnet50_fpn_camera_aware_lssv2_50m.py",
+]
+
+experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_resnet50_fpn_camera_depth_aware_lssv2_50e_8xb16_base_50m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_depthlss_50e_8xb16_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_depthlss_50e_8xb16_base_120m.py
new file mode 100644
index 000000000..56c2930bb
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_depthlss_50e_8xb16_base_120m.py
@@ -0,0 +1,32 @@
+_base_ = [
+    "../default_bevfusion_camera_50e_8xb16_base_50m.py",
+    "../../default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py",
+]
+
+experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_swin_fpn_depthlss_50e_8xb16_base_120m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        in_channels=80,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_base_50m.py
new file mode 100644
index 000000000..8d1ff7681
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_base_50m.py
@@ -0,0 +1,32 @@
+_base_ = [
+    "../default_bevfusion_camera_50e_8xb16_base_50m.py",
+    "../../default/models/swin_transformer/camera_swin_fpn_lss_50m.py",
+]
+
+experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_swin_fpn_lss_50e_8xb16_base_50m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        in_channels=80,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_120m.py
new file mode 100644
index 000000000..401ac7861
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_120m.py
@@ -0,0 +1,32 @@
+_base_ = [
+    "../default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py",
+    "../../default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py",
+]
+
+experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_swin_fpn_depthlss_50e_8xb16_j6gen2_base_120m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        in_channels=80,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_50m.py
new file mode 100644
index 000000000..80e81be39
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_50m.py
@@ -0,0 +1,32 @@
+_base_ = [
+    "../default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py",
+    "../../default/models/swin_transformer/camera_swin_fpn_lss_50m.py",
+]
+
+experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_50m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        in_channels=80,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py
index 4f81af760..a93b1d435 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py
@@ -2,8 +2,8 @@
     "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py",
     "../default/pipelines/default_camera_lidar_intensity_120m.py",
-    "../default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py",
-    "../default/schedulers/default_20e_8xb8_adamw_linear_cosine.py",
+    "../default/models/default_camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py",
+    "../default/schedulers/default_20e_8xb16_adamw_linear_cosine.py",
     "../default/default_misc.py",
 ]
 
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py
index 20c85b1d8..b8408956b 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py
@@ -2,8 +2,8 @@
     "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py",
     "../default/pipelines/default_camera_lidar_intensity_120m.py",
-    "../default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py",
-    "../default/schedulers/default_20e_8xb8_adamw_linear_cosine.py",
+    "../default/models/default_camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py",
+    "../default/schedulers/default_20e_8xb16_adamw_linear_cosine.py",
     "../default/default_misc.py",
 ]
 
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py
similarity index 88%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py
index d32dc9c70..bcf9870c6 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py
@@ -3,7 +3,7 @@
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py",
     "../default/pipelines/default_lidar_intensity_120m.py",
     "../default/models/default_lidar_second_secfpn_120m.py",
-    "../default/schedulers/default_30e_8xb8_adamw_cosine.py",
+    "../default/schedulers/default_30e_8xb16_adamw_cosine.py",
     "../default/default_misc.py",
 ]
 
@@ -13,10 +13,10 @@
 
 # user setting
 data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
+info_directory_path = "info/kokseang_2_8_1/"
 
-experiment_group_name = "bevfusion_lidar_intensity/j6gen2_base/" + _base_.dataset_type
-experiment_name = "lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m"
+experiment_group_name = "bevfusion_lidar_intensity_2_8_3/j6gen2_base_normal_dense/" + _base_.dataset_type
+experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m"
 work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
 
 # model parameter
@@ -25,22 +25,18 @@
     voxelize_cfg=dict(
         point_cloud_range=_base_.point_cloud_range,
         voxel_size=_base_.voxel_size,
-        voxelize_reduce=True,
     ),
-    pts_voxel_encoder=dict(num_features=_base_.point_use_dim),
-    pts_middle_encoder=dict(
-        in_channels=_base_.point_use_dim,
-        sparse_shape=_base_.grid_size,
-        num_aug_features=5,
+    pts_voxel_encoder=dict(
+        in_channels=len(_base_.lidar_sweep_dims),
         # min-max normalization for x, y, z, intensity, time_lag, where the max of time lag technically is two seeps (200 ms) here
-        aug_features_min_values=[
+        min_norm_values=[
             _base_.point_cloud_range[0],
             _base_.point_cloud_range[1],
             _base_.point_cloud_range[2],
             0.0,
             0.0,
         ],
-        aug_features_max_values=[
+        max_norm_values=[
             _base_.point_cloud_range[3],
             _base_.point_cloud_range[4],
             _base_.point_cloud_range[5],
@@ -48,6 +44,11 @@
             0.2,
         ],
     ),
+    pts_middle_encoder=dict(
+        in_channels=50,
+        sparse_shape=_base_.grid_size,
+        dense_output_shapes=_base_.sparse_dense_output_shapes,
+    ),
     bbox_head=dict(
         class_names=_base_.class_names,  # Use class names to identify the correct class indices
         train_cfg=dict(
@@ -164,4 +165,4 @@
 )
 log_processor = dict(window_size=50)
 
-load_from = None
+load_from = "work_dirs/bevfusion_lidar_2_8_2/base/T4Dataset/lidar_voxel_second_secfpn_50e_8xb16_base_120m/epoch_50.pth"
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m_t4metric_v2.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m_t4metric_v2.py
similarity index 91%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m_t4metric_v2.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m_t4metric_v2.py
index 260080bc6..3bdda213e 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m_t4metric_v2.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m_t4metric_v2.py
@@ -1,10 +1,10 @@
 _base_ = [
-    "./bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m.py",
+    "./bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py",
 ]
 
 # user setting
-experiment_group_name = "bevfusion_lidar_intensity/j6gen2_base/" + _base_.dataset_type
-experiment_name = "lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m_t4metric_v2"
+experiment_group_name = "bevfusion_lidar_intensity_2_8_1/j6gen2_base/" + _base_.dataset_type
+experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m_t4metric_v2"
 work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
 
 # Add evaluator configs
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py
similarity index 88%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py
index 8fa1509b9..1f2acd6ab 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py
@@ -3,7 +3,7 @@
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py",
     "../default/pipelines/default_lidar_intensity_120m.py",
     "../default/models/default_lidar_second_secfpn_120m.py",
-    "../default/schedulers/default_30e_8xb8_adamw_cosine.py",
+    "../default/schedulers/default_30e_8xb16_adamw_cosine.py",
     "../default/default_misc.py",
 ]
 
@@ -13,10 +13,10 @@
 
 # user setting
 data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
+info_directory_path = "info/kokseang_2_8_1/"
 
-experiment_group_name = "bevfusion_lidar_intensity/jpntaxi_base/" + _base_.dataset_type
-experiment_name = "lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m"
+experiment_group_name = "bevfusion_lidar_intensity_2_8_2/jpntaxi_base/" + _base_.dataset_type
+experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m"
 work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
 
 # model parameter
@@ -25,22 +25,18 @@
     voxelize_cfg=dict(
         point_cloud_range=_base_.point_cloud_range,
         voxel_size=_base_.voxel_size,
-        voxelize_reduce=True,
     ),
-    pts_voxel_encoder=dict(num_features=_base_.point_use_dim),
-    pts_middle_encoder=dict(
-        in_channels=_base_.point_use_dim,
-        sparse_shape=_base_.grid_size,
-        num_aug_features=5,
+    pts_voxel_encoder=dict(
+        in_channels=len(_base_.lidar_sweep_dims),
         # min-max normalization for x, y, z, intensity, time_lag, where the max of time lag technically is two seeps (200 ms) here
-        aug_features_min_values=[
+        min_norm_values=[
             _base_.point_cloud_range[0],
             _base_.point_cloud_range[1],
             _base_.point_cloud_range[2],
             0.0,
             0.0,
         ],
-        aug_features_max_values=[
+        max_norm_values=[
             _base_.point_cloud_range[3],
             _base_.point_cloud_range[4],
             _base_.point_cloud_range[5],
@@ -48,6 +44,11 @@
             0.2,
         ],
     ),
+    pts_middle_encoder=dict(
+        in_channels=50,
+        sparse_shape=_base_.grid_size,
+        dense_output_shapes=_base_.sparse_dense_output_shapes,
+    ),
     bbox_head=dict(
         class_names=_base_.class_names,  # Use class names to identify the correct class indices
         train_cfg=dict(
@@ -164,4 +165,6 @@
 )
 log_processor = dict(window_size=50)
 
-load_from = None
+load_from = (
+    "work_dirs/bevfusion_lidar_2_8_0/base/T4Dataset/lidar_voxel_second_secfpn_50e_8xb16_base_120m/best_epoch_47.pth"
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m_t4metric_v2.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m_t4metric_v2.py
similarity index 91%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m_t4metric_v2.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m_t4metric_v2.py
index 3d543e709..64d494655 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m_t4metric_v2.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m_t4metric_v2.py
@@ -1,10 +1,10 @@
 _base_ = [
-    "./bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m.py",
+    "./bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py",
 ]
 
 # user setting
-experiment_group_name = "bevfusion_lidar_intensity/jpntaxi_base/" + _base_.dataset_type
-experiment_name = "lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m_t4metric_v2"
+experiment_group_name = "bevfusion_lidar_intensity_2_8_1/jpntaxi_base/" + _base_.dataset_type
+experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m_t4metric_v2"
 work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
 
 # Add evaluator configs
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py
similarity index 85%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py
index e8068332a..446649985 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py
@@ -3,7 +3,7 @@
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/base.py",
     "../default/pipelines/default_lidar_120m.py",
     "../default/models/default_lidar_second_secfpn_120m.py",
-    "../default/schedulers/default_50e_8xb8_adamw_cosine.py",
+    "../default/schedulers/default_50e_8xb16_adamw_cosine.py",
     "../default/default_misc.py",
 ]
 
@@ -13,10 +13,10 @@
 
 # user setting
 data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
+info_directory_path = "info/kokseang_2_8_1/"
 
-experiment_group_name = "bevfusion_lidar/base/" + _base_.dataset_type
-experiment_name = "lidar_voxel_second_secfpn_50e_8xb8_base_120m"
+experiment_group_name = "bevfusion_lidar_2_8_0/base_more_filters/" + _base_.dataset_type
+experiment_name = "lidar_voxel_second_secfpn_50e_8xb16_base_120m"
 work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
 
 # model parameter
@@ -25,26 +25,17 @@
     voxelize_cfg=dict(
         point_cloud_range=_base_.point_cloud_range,
         voxel_size=_base_.voxel_size,
-        voxelize_reduce=True,
     ),
-    pts_voxel_encoder=dict(num_features=_base_.point_use_dim),
+    pts_voxel_encoder=dict(
+        in_channels=len(_base_.lidar_sweep_dims),
+        # min-max normalization for x, y, z, time_lag, where the max of time lag technically is two seeps (200 ms) here
+        min_norm_values=[_base_.point_cloud_range[0], _base_.point_cloud_range[1], _base_.point_cloud_range[2], 0.0],
+        max_norm_values=[_base_.point_cloud_range[3], _base_.point_cloud_range[4], _base_.point_cloud_range[5], 0.2],
+    ),
     pts_middle_encoder=dict(
-        in_channels=_base_.point_use_dim,
+        in_channels=32,
         sparse_shape=_base_.grid_size,
-        num_aug_features=4,
-        # min-max normalization for x, y, z, time_lag, where the max of time lag technically is two seeps (200 ms) here
-        aug_features_min_values=[
-            _base_.point_cloud_range[0],
-            _base_.point_cloud_range[1],
-            _base_.point_cloud_range[2],
-            0.0,
-        ],
-        aug_features_max_values=[
-            _base_.point_cloud_range[3],
-            _base_.point_cloud_range[4],
-            _base_.point_cloud_range[5],
-            0.2,
-        ],
+        dense_output_shapes=_base_.sparse_dense_output_shapes,
     ),
     bbox_head=dict(
         class_names=_base_.class_names,  # Use class names to identify the correct class indices
@@ -161,3 +152,5 @@
     checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"),
 )
 log_processor = dict(window_size=50)
+
+resume = True
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m_t4metric_v2.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m_t4metric_v2.py
similarity index 97%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m_t4metric_v2.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m_t4metric_v2.py
index efcd091f5..98a65a3f9 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m_t4metric_v2.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m_t4metric_v2.py
@@ -18,7 +18,7 @@
 frame_pass_fail_config = dict(
     target_labels=_base_.class_names,
     # Matching thresholds per class (must align with `plane_distance_thresholds` used in evaluation)
-    matching_threshold_list=[2.0, 2.0, 2.0, 2.0, 2.0],
+    matching_threshold_list=[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
     confidence_threshold_list=None,
 )
 
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_resnet50_fpn_depthlss_lidar_second_secfpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/default_camera_resnet50_fpn_depthlss_lidar_second_secfpn_120m.py
new file mode 100644
index 000000000..339f3e97e
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/default_camera_resnet50_fpn_depthlss_lidar_second_secfpn_120m.py
@@ -0,0 +1,53 @@
+_base_ = [
+    "./default_lidar_second_secfpn_120m.py",
+]
+
+# Image network
+model = dict(
+    data_preprocessor=dict(
+        type="Det3DDataPreprocessor",
+        pad_size_divisor=32,
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=False,
+        rgb_to_bgr=False,
+    ),
+    img_backbone=dict(
+        pretrained="torchvision://resnet50",
+        type="ResNet",
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type="BN2d", requires_grad=True),
+        norm_eval=False,
+        with_cp=False,
+        style="pytorch",
+        init_cfg=dict(
+            type="Pretrained",
+            checkpoint="work_dirs/resnet50/mmdet_resnet50-19c8e357.pth",  # noqa: E251
+        ),
+    ),
+    img_neck=dict(
+        type="GeneralizedLSSFPN",
+        in_channels=[1024, 2048],
+        out_channels=256,
+        start_level=0,
+        num_outs=2,
+        norm_cfg=dict(type="BN2d", requires_grad=True),
+        act_cfg=dict(type="ReLU", inplace=True),
+        upsample_cfg=dict(mode="bilinear", align_corners=False),
+    ),
+    view_transform=dict(
+        type="DepthLSSTransform",
+        in_channels=256,
+        out_channels=80,
+        feature_size=[48, 96],
+        xbound=[-122.40, 122.40, 0.68],
+        ybound=[-122.40, 122.40, 0.68],
+        zbound=[-10.0, 10.0, 20.0],
+        dbound=[1.0, 130, 1.0],
+        downsample=2,
+    ),
+    fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256, kernel_size=5, stride=2, padding=2),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m.py
index 809179b20..f11431814 100644
--- a/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m.py
@@ -1,25 +1,25 @@
 num_proposals = 500
-max_num_points = 10
+max_num_points = 32
 max_voxels = [120000, 160000]
+out_size_factor = 8
 
 model = dict(
     type="BEVFusion",
     voxelize_cfg=dict(
         max_num_points=max_num_points,
         max_voxels=max_voxels,
-        voxelize_reduce=True,
     ),
     data_preprocessor=dict(
         type="Det3DDataPreprocessor",
         pad_size_divisor=32,
     ),
-    pts_voxel_encoder=dict(type="HardSimpleVFE"),
+    pts_voxel_encoder=dict(
+        type="HardSimpleVoxelSinCosEncoder",
+        in_channels=4,
+    ),
     pts_middle_encoder=dict(
         type="BEVFusionSparseEncoder",
         in_channels=5,
-        aug_features_min_values=[],
-        aug_features_max_values=[],
-        num_aug_features=0,
         order=("conv", "norm", "act"),
         norm_cfg=dict(type="BN1d", eps=0.001, momentum=0.01),
         encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
@@ -69,7 +69,7 @@
         ),
         train_cfg=dict(
             dataset="t4datasets",
-            out_size_factor=8,
+            out_size_factor=out_size_factor,
             gaussian_overlap=0.1,
             min_radius=2,
             pos_weight=-1,
@@ -84,15 +84,18 @@
         ),
         test_cfg=dict(
             dataset="t4datasets",
-            out_size_factor=8,
-            nms_type=None,  # Set to "circle" for circle_nms
+            out_size_factor=out_size_factor,
+            nms_type="circle",  # Set to "circle" for circle_nms
             # Set NMS for different clusters
             nms_clusters=[
-                dict(class_names=["car", "truck", "bus"], nms_threshold=0.5),  # It's radius if using circle_nms
-                dict(class_names=["bicycle"], nms_threshold=0.5),
-                dict(class_names=["pedestrian"], nms_threshold=0.175),
-                dict(class_names=["barrier"], nms_threshold=0.5),
-                dict(class_names=["traffic_cone"], nms_threshold=0.175),
+                # Sqrt(0.25) = 0.50
+                dict(
+                    class_names=["car", "truck", "bus"], class_indices=[0, 1, 2], nms_threshold=0.25, post_max_size=300
+                ),  # It's radius if using circle_nms
+                dict(class_names=["bicycle"], class_indices=[3], nms_threshold=0.0, post_max_size=50),
+                dict(class_names=["pedestrian"], class_indices=[4], nms_threshold=0.0, post_max_size=100),
+                dict(class_names=["traffic_cone"], class_indices=[5], nms_threshold=0.0, post_max_size=100),
+                dict(class_names=["barrier"], class_indices=[6], nms_threshold=0.0, post_max_size=50),
             ],
         ),
         dense_heatmap_pooling_classes=["car", "truck", "bus", "bicycle", "barrier"],  # Use class indices for pooling
@@ -100,7 +103,9 @@
         bbox_coder=dict(
             type="TransFusionBBoxCoder",
             post_center_range=[-200.0, -200.0, -10.0, 200.0, 200.0, 10.0],
-            score_threshold=0.0,
+            # score_threshold=0.03,
+            # CAR, TRUCK, BUS, BICYCLE, PEDESTRIAN, TRAFFIC_CONE, BARRIER
+            score_threshold=[0.015, 0.010, 0.010, 0.020, 0.030, 0.040, 0.020],
             out_size_factor=8,
             code_size=10,
         ),
@@ -112,8 +117,10 @@
             reduction="mean",
             loss_weight=1.0,
         ),
-        loss_heatmap=dict(type="mmdet.GaussianFocalLoss", reduction="mean", loss_weight=1.0),
+        loss_iou=None,
+        loss_heatmap=dict(type="mmdet.GaussianFocalLoss", reduction="none", loss_weight=1.0),
         loss_bbox=dict(type="mmdet.L1Loss", reduction="mean", loss_weight=0.25),
-        partial_ignore_labels=None,
+        # partial_
+        partial_ignore_labels=["traffic_cone", "barrier"],
     ),
 )
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py
new file mode 100644
index 000000000..e90687fe3
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py
@@ -0,0 +1,10 @@
+_base_ = [
+    "./default_lidar_second_secfpn_120m.py",
+]
+
+model = dict(
+    bbox_head=dict(
+        common_heads=dict(center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2], iou=[1, 2]),
+        loss_iou=dict(type="mmdet.L1Loss", reduction="mean", loss_weight=1.0),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py
new file mode 100644
index 000000000..c807668a3
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py
@@ -0,0 +1,60 @@
+_base_ = [
+    "../default_lidar_second_secfpn_120m.py",
+]
+
+# Image network
+model = dict(
+    # Remove all lidar related configs
+    voxelize_cfg=None,
+    pts_voxel_encoder=None,
+    pts_middle_encoder=None,
+    pts_neck=None,
+    pts_backbone=None,
+    data_preprocessor=dict(
+        type="Det3DDataPreprocessor",
+        pad_size_divisor=32,
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=False,
+        rgb_to_bgr=False,
+    ),
+    img_backbone=dict(
+        type="mmdet.ResNet",
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type="BN2d", requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style="pytorch",
+        init_cfg=dict(
+            type="Pretrained",
+            checkpoint="work_dirs/resnet50/mmdet_resnet50-19c8e357.pth",  # noqa: E251
+        ),
+    ),
+    img_neck=dict(
+        type="GeneralizedLSSFPN",
+        in_channels=[512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        num_outs=2,
+        norm_cfg=dict(type="BN2d", requires_grad=True),
+        act_cfg=dict(type="ReLU", inplace=True),
+        upsample_cfg=dict(mode="bilinear", align_corners=False),
+    ),
+    view_transform=dict(
+        type="DepthLSSTransform",
+        in_channels=256,
+        out_channels=80,
+        feature_size=[48, 96],
+        xbound=[-122.40, 122.40, 0.68],
+        ybound=[-122.40, 122.40, 0.68],
+        zbound=[-10.0, 10.0, 20.0],
+        dbound=[1.0, 130, 1.0],
+        downsample=2,
+    ),
+    bbox_head=dict(
+        in_channels=80,
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_50m.py
new file mode 100644
index 000000000..1457207c8
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_50m.py
@@ -0,0 +1,24 @@
+_base_ = [
+    "./camera_resnet50_fpn_depthlss_120m.py",
+]
+num_proposals = 200
+
+# Image network
+model = dict(
+    depth_gt_downsample=8,
+    loss_depth_weight=2.0,
+    view_transform=dict(
+        type="LSSTransformV2",
+        xbound=[-54.0, 54.0, 0.3],
+        ybound=[-54.0, 54.0, 0.3],
+        zbound=[-10.0, 10.0, 20.0],
+        dbound=[1.0, 60, 0.5],
+        downsample=2,
+    ),
+    bbox_head=dict(
+        num_proposals=num_proposals,
+        bbox_coder=dict(
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depth_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depth_50m.py
new file mode 100644
index 000000000..dd90ccb6e
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depth_50m.py
@@ -0,0 +1,26 @@
+_base_ = [
+    "./camera_resnet50_fpn_depthlss_120m.py",
+]
+num_proposals = 200
+
+# Image network
+model = dict(
+    depth_gt_downsample=8,
+    loss_depth_weight=1.0,
+    view_transform=dict(
+        # type="LSSTransformV2",
+        type="LSSTransformV2DepthAware",
+        xbound=[-54.0, 54.0, 0.3],
+        ybound=[-54.0, 54.0, 0.3],
+        zbound=[-10.0, 10.0, 20.0],
+        dbound=[1.0, 60, 0.5],
+        downsample=2,
+        camera_depth_aware_configs=dict(mlp_drop_out=0.0, downsample=8, num_camera_depth_parameters=27),
+    ),
+    bbox_head=dict(
+        num_proposals=num_proposals,
+        bbox_coder=dict(
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py
similarity index 97%
rename from projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py
rename to projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py
index c4b0cd9ab..88e74efc7 100644
--- a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py
@@ -1,5 +1,5 @@
 _base_ = [
-    "./default_lidar_second_secfpn_120m.py",
+    "../default_lidar_second_secfpn_120m.py",
 ]
 
 # Image network
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py
similarity index 94%
rename from projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py
rename to projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py
index c4097de3d..2ac22b1b6 100644
--- a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py
@@ -1,5 +1,5 @@
 _base_ = [
-    "./default_lidar_second_secfpn_120m.py",
+    "../default_lidar_second_secfpn_120m.py",
 ]
 
 # Image network
@@ -56,5 +56,5 @@
         dbound=[1.0, 130, 1.0],
         downsample=2,
     ),
-    fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256, kernel_size=5, padding=2),
+    fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256, kernel_size=5, stride=2, padding=2),
 )
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_50m.py
new file mode 100644
index 000000000..1294416ad
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_50m.py
@@ -0,0 +1,23 @@
+_base_ = [
+    "./default_camera_swin_fpn_depthlss_120m.py",
+]
+
+# Image network
+model = dict(
+    view_transform=dict(
+        type="LSSTransform",
+        in_channels=256,
+        out_channels=80,
+        feature_size=[48, 96],
+        xbound=[-54.0, 54.0, 0.3],
+        ybound=[-54.0, 54.0, 0.3],
+        zbound=[-10.0, 10.0, 20.0],
+        dbound=[1.0, 60, 0.5],
+        downsample=2,
+    ),
+    bbox_head=dict(
+        bbox_coder=dict(
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py
new file mode 100644
index 000000000..3a7a428b7
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py
@@ -0,0 +1,151 @@
+## This config is for the camera_base only model, without lidar points
+
+_base_ = [
+    "../default_lidar_120m.py",
+]
+input_modality = dict(use_lidar=True, use_camera=True)
+
+# Image parameters
+image_size = [384, 768]  # Height, Width
+camera_order = ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"]
+
+train_pipeline = [
+    dict(
+        type="BEVLoadMultiViewImageFromFiles",
+        to_float32=True,
+        color_type="color",
+        backend_args=_base_.backend_args,
+        camera_order=camera_order,
+    ),
+    # We keep loading LiDAR points to make downstream BEV augmentation easier
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.point_load_dim,
+        backend_args=_base_.backend_args,
+    ),
+    dict(
+        type="LoadPointsFromMultiSweeps",
+        sweeps_num=_base_.sweeps_num,
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.lidar_sweep_dims,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=_base_.backend_args,
+        test_mode=False,
+    ),
+    dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(
+        type="ImageAug3D",
+        final_dim=image_size,
+        resize_lim=[0.28, 0.40],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=True,
+        is_train=True,
+    ),
+    dict(
+        type="BEVFusionGlobalRotScaleTrans",
+        scale_ratio_range=[0.95, 1.05],
+        rot_range=[-0.78539816, 0.78539816],
+        translation_std=[0.5, 0.5, 0.2],
+    ),
+    dict(type="BEVFusionRandomFlip3D"),
+    dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=5),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=3),
+    dict(
+        type="ObjectNameFilter",
+        classes=[
+            "car",
+            "truck",
+            "bus",
+            "bicycle",
+            "pedestrian",
+            "traffic_cone",
+            "barrier",
+        ],
+    ),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "transformation_3d_flow",
+            "pcd_rotation",
+            "pcd_scale_factor",
+            "pcd_trans",
+            "img_aug_matrix",
+            "lidar_aug_matrix",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+            "gt_depths",
+        ],
+    ),
+]
+
+test_pipeline = [
+    dict(
+        type="BEVLoadMultiViewImageFromFiles",
+        to_float32=True,
+        color_type="color",
+        backend_args=_base_.backend_args,
+        camera_order=camera_order,
+    ),
+    dict(
+        type="LoadPointsFromMultiSweeps",
+        sweeps_num=_base_.sweeps_num,
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.lidar_sweep_dims,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=_base_.backend_args,
+        test_mode=True,
+    ),
+    dict(
+        type="ImageAug3D",
+        final_dim=image_size,
+        resize_lim=[0.34, 0.34],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=False,
+        is_train=False,
+    ),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "num_pts_feats",
+            "num_views",
+            "timestamp",
+            "vehicle_type",
+            "city",
+        ],
+    ),
+]
+
+filter_cfg = dict(filter_frames_with_camera_order=camera_order)
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py
new file mode 100644
index 000000000..011e460f0
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py
@@ -0,0 +1,144 @@
+## This config is for the camera_base only model, without lidar points
+
+_base_ = [
+    "../default_lidar_50m.py",
+]
+input_modality = dict(use_lidar=True, use_camera=True)
+
+# Image parameters
+image_size = [384, 768]  # Height, Width
+camera_order = ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"]
+
+train_pipeline = [
+    dict(
+        type="BEVLoadMultiViewImageFromFiles",
+        to_float32=True,
+        color_type="color",
+        backend_args=_base_.backend_args,
+        camera_order=camera_order,
+    ),
+    # We keep loading LiDAR points to make downstream BEV augmentation easier
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.point_load_dim,
+        backend_args=_base_.backend_args,
+    ),
+    dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(
+        type="ImageAug3D",
+        final_dim=image_size,
+        resize_lim=[0.29, 0.35],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=True,
+        is_train=True,
+    ),
+    dict(type="PointsRangeFilter", point_cloud_range=[-80.0, -80.0, -10.0, 80.0, 80.0, 10.0]),
+    dict(
+        type="PointsToMultiViewImageDepths",
+        img_shape=image_size,
+        num_cameras=len(camera_order),
+        depth_bounds=[1.0, 60.0],
+        # visualize_dir="work_dirs/visualize_depths_6",
+    ),
+    dict(
+        type="BEVFusionGlobalRotScaleTrans",
+        scale_ratio_range=[0.95, 1.05],
+        rot_range=[-0.78539816, 0.78539816],
+        translation_std=[0.5, 0.5, 0.2],
+    ),
+    dict(type="BEVFusionRandomFlip3D"),
+    dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=5),
+    # Remove LiDAR points from the data
+    dict(type="BEVFusionRemoveLiDARPoints"),
+    dict(
+        type="ObjectNameFilter",
+        classes=[
+            "car",
+            "truck",
+            "construction_vehicle",
+            "bus",
+            "trailer",
+            "barrier",
+            "motorcycle",
+            "bicycle",
+            "pedestrian",
+            "traffic_cone",
+        ],
+    ),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "transformation_3d_flow",
+            "pcd_rotation",
+            "pcd_scale_factor",
+            "pcd_trans",
+            "img_aug_matrix",
+            "lidar_aug_matrix",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+            "gt_depths",
+        ],
+    ),
+]
+
+test_pipeline = [
+    dict(
+        type="BEVLoadMultiViewImageFromFiles",
+        to_float32=True,
+        color_type="color",
+        backend_args=_base_.backend_args,
+        camera_order=camera_order,
+    ),
+    dict(
+        type="ImageAug3D",
+        final_dim=image_size,
+        resize_lim=[0.32, 0.32],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=False,
+        is_train=False,
+    ),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "num_pts_feats",
+            "num_views",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+        ],
+    ),
+]
+
+filter_cfg = dict(filter_frames_with_camera_order=camera_order)
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py
index 963a218e1..11e297c09 100644
--- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py
@@ -1,6 +1,6 @@
 # Dataset parameters
 backend_args = None
-num_workers = 32
+num_workers = 8
 input_modality = dict(use_lidar=True, use_camera=True)
 
 # range setting
@@ -13,6 +13,8 @@
     "bus": 120,
     "bicycle": 120,
     "pedestrian": 120,
+    "traffic_cone": 120,
+    "barrier": 120,
 }
 
 # LiDAR parameters
@@ -56,7 +58,7 @@
         final_dim=image_size,
         resize_lim=[0.29, 0.35],
         bot_pct_lim=[0.0, 0.0],
-        rot_lim=[-5.4, 5.4],
+        rot_lim=[0.0, 0.0],
         rand_flip=True,
         is_train=True,
     ),
@@ -69,21 +71,21 @@
     dict(type="BEVFusionRandomFlip3D"),
     dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range),
     dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range),
+    dict(type="BEVFusionRemoveLiDARPoints"),
     dict(
         type="ObjectNameFilter",
         classes=[
             "car",
             "truck",
-            "construction_vehicle",
             "bus",
-            "trailer",
-            "barrier",
-            "motorcycle",
             "bicycle",
             "pedestrian",
             "traffic_cone",
+            "barrier",
         ],
     ),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=2),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=1),
     dict(type="PointShuffle"),
     dict(
         type="Pack3DDetInputs",
@@ -107,6 +109,9 @@
             "img_aug_matrix",
             "lidar_aug_matrix",
             "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
         ],
     ),
 ]
@@ -134,7 +139,7 @@
         pad_empty_sweeps=True,
         remove_close=True,
         backend_args=backend_args,
-        test_mode=True,
+        test_mode=False,
     ),
     dict(
         type="ImageAug3D",
@@ -164,6 +169,9 @@
             "num_pts_feats",
             "num_views",
             "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
         ],
     ),
 ]
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py
index 09b9f7b26..d797779de 100644
--- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py
@@ -7,6 +7,9 @@
 point_cloud_range = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0]
 voxel_size = [0.17, 0.17, 0.2]
 grid_size = [1440, 1440, 41]
+# Sparse dense output shapes
+sparse_dense_output_shapes = [180, 180, 2]
+
 eval_class_range = {
     "car": 120,
     "truck": 120,
@@ -63,6 +66,8 @@
             "barrier",
         ],
     ),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=5),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=3),
     dict(type="PointShuffle"),
     dict(
         type="Pack3DDetInputs",
@@ -143,4 +148,4 @@
 #   e.g., dict(filter_frames_with_missing_image=True).
 # - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so
 #   image-based filtering does not apply and `filter_cfg` is intentionally None.
-filter_cfg = None
+filter_cfg = dict()
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py
new file mode 100644
index 000000000..ca75d799f
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py
@@ -0,0 +1,148 @@
+# Dataset parameters
+backend_args = None
+num_workers = 4
+input_modality = dict(use_lidar=True, use_camera=False)
+
+# range setting
+point_cloud_range = [-54.0, -54.0, -3.0, 54.0, 54.0, 5.0]
+voxel_size = [0.075, 0.075, 0.2]
+grid_size = [1440, 1440, 41]
+eval_class_range = {
+    "car": 51.2,
+    "truck": 51.2,
+    "bus": 51.2,
+    "bicycle": 51.2,
+    "pedestrian": 51.2,
+    "traffic_cone": 51.2,
+    "barrier": 51.2,
+}
+
+# LiDAR parameters
+point_load_dim = 5  # x, y, z, intensity, ring_id
+point_use_dim = 4
+lidar_sweep_dims = [0, 1, 2, 4]  # x, y, z, time_lag
+sweeps_num = 1
+
+train_pipeline = [
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=point_load_dim,
+        use_dim=point_load_dim,
+        backend_args=backend_args,
+    ),
+    dict(
+        type="LoadPointsFromMultiSweeps",
+        sweeps_num=sweeps_num,
+        load_dim=point_load_dim,
+        use_dim=lidar_sweep_dims,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args,
+        test_mode=False,
+    ),
+    dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(
+        type="BEVFusionGlobalRotScaleTrans",
+        scale_ratio_range=[0.95, 1.05],
+        rot_range=[-0.78539816, 0.78539816],
+        translation_std=[0.5, 0.5, 0.2],
+    ),
+    dict(type="BEVFusionRandomFlip3D"),
+    dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range),
+    dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range),
+    dict(
+        type="ObjectNameFilter",
+        classes=[
+            "car",
+            "truck",
+            "bus",
+            "bicycle",
+            "pedestrian",
+            "traffic_cone",
+            "barrier",
+        ],
+    ),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=2),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=1),
+    dict(type="PointShuffle"),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "transformation_3d_flow",
+            "pcd_rotation",
+            "pcd_scale_factor",
+            "pcd_trans",
+            "img_aug_matrix",
+            "lidar_aug_matrix",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+        ],
+    ),
+]
+
+test_pipeline = [
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=point_load_dim,
+        use_dim=point_load_dim,
+        backend_args=backend_args,
+    ),
+    dict(
+        type="LoadPointsFromMultiSweeps",
+        sweeps_num=sweeps_num,
+        load_dim=point_load_dim,
+        use_dim=lidar_sweep_dims,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args,
+        test_mode=True,
+    ),
+    dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "num_pts_feats",
+            "num_views",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+        ],
+    ),
+]
+
+# Filtering configuration
+# Note:
+# - In camera–LiDAR configs, `filter_cfg` can enable image-based frame filtering,
+#   e.g., dict(filter_frames_with_missing_image=True).
+# - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so
+#   image-based filtering does not apply and `filter_cfg` is intentionally None.
+filter_cfg = dict()
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py
index e2de195e9..d5e426f58 100644
--- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py
@@ -1,12 +1,15 @@
 # Dataset parameters
 backend_args = None
-num_workers = 32
+num_workers = 16
 input_modality = dict(use_lidar=True, use_camera=False)
 
 # range setting
 point_cloud_range = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0]
 voxel_size = [0.17, 0.17, 0.2]
 grid_size = [1440, 1440, 41]
+# Sparse dense output shapes
+sparse_dense_output_shapes = [180, 180, 2]
+
 eval_class_range = {
     "car": 120,
     "truck": 120,
@@ -63,6 +66,8 @@
             "barrier",
         ],
     ),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=2),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=1),
     dict(type="PointShuffle"),
     dict(
         type="Pack3DDetInputs",
@@ -143,4 +148,4 @@
 #   e.g., dict(filter_frames_with_missing_image=True).
 # - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so
 #   image-based filtering does not apply and `filter_cfg` is intentionally None.
-filter_cfg = None
+filter_cfg = dict()
diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py
similarity index 98%
rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py
rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py
index 64bc2b717..05740e442 100644
--- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py
+++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py
@@ -1,6 +1,6 @@
 # learning rate
-lr = 1e-4
-t_max = 6
+lr = 2e-4
+t_max = 2
 max_epochs = 20
 val_interval = 1
 
diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_cosine.py
similarity index 85%
rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_cosine.py
rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_cosine.py
index a2cd2d2e9..2893b2e74 100644
--- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_cosine.py
+++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_cosine.py
@@ -1,17 +1,16 @@
 # learning rate
-# 1e-4 * sqrt(2) = 0.0001414
-lr = 1.4141e-4
-t_max = 8
+lr = 2.0e-4
+t_max = 1
 max_epochs = 30
 val_interval = 5
 
-train_gpu_size = 8
-test_batch_size = 2
-train_batch_size = 8
+train_gpu_size = 2
+test_batch_size = 4
+train_batch_size = 32
 
 param_scheduler = [
     # learning rate scheduler
-    # During the first (max_epochs * 0.4) epochs, learning rate increases from 0 to lr * 10
+    # During the first (max_epochs * 0.10) epochs, learning rate increases from 0 to lr * 10
     # during the next epochs, learning rate decreases from lr * 10 to
     # lr * 1e-4
     dict(
@@ -33,7 +32,7 @@
         convert_to_iter_based=True,
     ),
     # momentum scheduler
-    # During the first (0.4 * max_epochs) epochs, momentum increases from 0 to 0.85 / 0.95
+    # During the first (max_epochs * 0.10) epochs, momentum increases from 0 to 0.85 / 0.95
     # during the next epochs, momentum increases from 0.85 / 0.95 to 1
     dict(
         type="CosineAnnealingMomentum",
diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_linear_cosine.py
new file mode 100644
index 000000000..95f5f96bd
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_linear_cosine.py
@@ -0,0 +1,69 @@
+# learning rate
+lr = 2e-4
+t_max = 3
+max_epochs = 30
+val_interval = 1
+
+train_gpu_size = 2
+test_batch_size = 2
+train_batch_size = 32
+
+param_scheduler = [
+    # learning rate scheduler
+    dict(
+        type="LinearLR",
+        start_factor=1.0 / 3,
+        begin=0,
+        end=t_max,
+        by_epoch=True,
+        convert_to_iter_based=True,
+    ),
+    dict(
+        type="CosineAnnealingLR",
+        T_max=(max_epochs - t_max),
+        eta_min=lr * 1e-4,
+        begin=t_max,
+        end=max_epochs,
+        by_epoch=True,
+        convert_to_iter_based=True,
+    ),
+    # momentum scheduler
+    # During the first (0.4 * max_epochs) epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type="CosineAnnealingMomentum",
+        T_max=t_max,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=t_max,
+        by_epoch=True,
+        convert_to_iter_based=True,
+    ),
+    dict(
+        type="CosineAnnealingMomentum",
+        T_max=(max_epochs - t_max),
+        eta_min=1,
+        begin=t_max,
+        end=max_epochs,
+        by_epoch=True,
+        convert_to_iter_based=True,
+    ),
+]
+
+train_cfg = dict(
+    by_epoch=True, max_epochs=max_epochs, val_interval=val_interval, dynamic_intervals=[(max_epochs - 5, 1)]
+)
+val_cfg = dict()
+test_cfg = dict()
+
+optim_wrapper = dict(
+    type="OptimWrapper",
+    optimizer=dict(type="AdamW", lr=lr, weight_decay=1e-2),
+    clip_grad=dict(max_norm=5.0, norm_type=2),
+)
+
+auto_scale_lr = dict(enable=False, base_batch_size=train_gpu_size * train_batch_size)
+
+# Only set if the number of train_gpu_size more than 1
+if train_gpu_size > 1:
+    sync_bn = "torch"
diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb8_adamw_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_cosine.py
similarity index 86%
rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb8_adamw_cosine.py
rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_cosine.py
index 87571d0b3..d209d0c1b 100644
--- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb8_adamw_cosine.py
+++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_cosine.py
@@ -1,17 +1,16 @@
 # learning rate
-# 1e-4 * sqrt(2) = 0.0001414
-lr = 1.4141e-4
-t_max = 15
+lr = 2.0e-4
+t_max = 5
 max_epochs = 50
 val_interval = 5
 
 train_gpu_size = 8
-test_batch_size = 2
-train_batch_size = 8
+test_batch_size = 4
+train_batch_size = 16
 
 param_scheduler = [
     # learning rate scheduler
-    # During the first (max_epochs * 0.4) epochs, learning rate increases from 0 to lr * 10
+    # During the first (max_epochs * 0.10) epochs, learning rate increases from 0 to lr * 10
     # during the next epochs, learning rate decreases from lr * 10 to
     # lr * 1e-4
     dict(
@@ -33,7 +32,7 @@
         convert_to_iter_based=True,
     ),
     # momentum scheduler
-    # During the first (0.4 * max_epochs) epochs, momentum increases from 0 to 0.85 / 0.95
+    # During the first (0.10 * max_epochs) epochs, momentum increases from 0 to 0.85 / 0.95
     # during the next epochs, momentum increases from 0.85 / 0.95 to 1
     dict(
         type="CosineAnnealingMomentum",
diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_linear_cosine.py
similarity index 96%
rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py
rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_linear_cosine.py
index 23d29acc1..43715fed7 100644
--- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py
+++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_linear_cosine.py
@@ -1,8 +1,8 @@
 # learning rate
-lr = 1e-4
-t_max = 8
-max_epochs = 30
-val_interval = 1
+lr = 2e-4
+t_max = 5
+max_epochs = 50
+val_interval = 5
 
 train_gpu_size = 8
 test_batch_size = 2
diff --git a/projects/BEVFusion/deploy/containers.py b/projects/BEVFusion/deploy/containers.py
index 51f2316df..ad9243412 100644
--- a/projects/BEVFusion/deploy/containers.py
+++ b/projects/BEVFusion/deploy/containers.py
@@ -1,8 +1,8 @@
+# Wrapper Classes for onnx conversion
+import numpy as np
 import torch
 import torch.nn.functional as F
 
-# Wrapper Classes for onnx conversion
-
 
 class TrtBevFusionImageBackboneContainer(torch.nn.Module):
     def __init__(self, mod, mean, std) -> None:
diff --git a/projects/BEVFusion/deploy/exporter.py b/projects/BEVFusion/deploy/exporter.py
index b1a430eb9..c7cd9e6b0 100644
--- a/projects/BEVFusion/deploy/exporter.py
+++ b/projects/BEVFusion/deploy/exporter.py
@@ -2,7 +2,7 @@
 
 import logging
 import os.path as osp
-from typing import Optional
+from typing import Any, Optional
 
 import numpy as np
 import onnx
@@ -11,12 +11,32 @@
 from builder import ExportBuilder
 from containers import TrtBevFusionCameraOnlyContainer, TrtBevFusionImageBackboneContainer, TrtBevFusionMainContainer
 from data_classes import ModelData, SetupConfigs
-from mmdeploy.core import RewriterContext
+from mmdeploy.core import SYMBOLIC_REWRITER, RewriterContext
 from mmdeploy.utils import (
     get_root_logger,
 )
 
 
+def purge_mmdeploy_symbolics(op_names: list[str]) -> dict:
+    """Delete mmdeploy's symbolic records for the given op names.
+    Both the op-name key (e.g. `"layer_norm"`) and the function-path
+    bookkeeping key (e.g. `"mmdeploy.pytorch.symbolics.layer_norm.layer_norm__default"`)
+    are removed. Returns a snapshot of what was deleted for optional restore.
+    """
+    records = SYMBOLIC_REWRITER._registry._rewrite_records
+    removed: dict = {}
+    for key in list(records.keys()):
+        # Primary key: the aten op name itself.
+        if key in op_names:
+            removed[key] = records.pop(key)
+            continue
+        # Bookkeeping key: full Python path of an implementer function.
+        # Match by "...symbolics.<op_name>." or "...symbolics.<op_name>__"
+        if any(f".symbolics.{op}." in key or f".symbolics.{op}__" in key for op in op_names):
+            removed[key] = records.pop(key)
+    return removed
+
+
 class Torch2OnnxExporter:
 
     def __init__(self, setup_configs: SetupConfigs, log_level: str):
@@ -62,6 +82,10 @@ def _export_model(
           patched_model (torch.nn.Module): Patched Pytorch model.
           ir_configs (dict): Configs for intermediate representations in ONNX.
         """
+        # Purge the mmdeploy symbolic records for the layer_norm op, remove this if LayerNorm OP is not supported
+        # in the tensorrt version
+        removed = purge_mmdeploy_symbolics(["layer_norm"])
+        self.logger.info(f"Purged {len(removed)} mmdeploy symbolic records: {list(removed.keys())}")
         with RewriterContext(**context_info), torch.no_grad():
             image_feats = None
             if "img_backbone" in self.setup_configs.model_cfg.model:
diff --git a/projects/BEVFusion/setup.py b/projects/BEVFusion/setup.py
index 38f588b20..52d397c12 100644
--- a/projects/BEVFusion/setup.py
+++ b/projects/BEVFusion/setup.py
@@ -54,6 +54,14 @@ def make_cuda_ext(name, module, sources, sources_cuda=[], extra_args=[], extra_i
                     "src/bev_pool_cuda.cu",
                 ],
             ),
+            make_cuda_ext(
+                name="bev_pool_v2_ext",
+                module="projects.BEVFusion.bevfusion.ops.bev_pool_v2",
+                sources=[
+                    "src/bev_pool.cpp",
+                    "src/bev_pool_cuda.cu",
+                ],
+            ),
             make_cuda_ext(
                 name="voxel_layer",
                 module="projects.BEVFusion.bevfusion.ops.voxel",