diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/base.py b/autoware_ml/configs/detection3d/dataset/t4dataset/base.py index 3be587072..7f4be6293 100644 --- a/autoware_ml/configs/detection3d/dataset/t4dataset/base.py +++ b/autoware_ml/configs/detection3d/dataset/t4dataset/base.py @@ -29,6 +29,9 @@ "db_j6gen2_v7", "db_j6gen2_v8", "db_j6gen2_v9", + "db_j6gen2_v10", + "db_j6gen2_v11", + "db_j6gen2_v12", "db_largebus_v1", "db_largebus_v2", "db_largebus_v3", @@ -146,6 +149,8 @@ "traffic_cone": "traffic_cone", "trafficcone": "traffic_cone", "barrier": "barrier", + "other_vehicle": "car", + "other_pedestrian": "pedestrian", } class_names = ["car", "truck", "bus", "bicycle", "pedestrian", "traffic_cone", "barrier"] diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2.py b/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2.py index 0324e7207..a87166019 100644 --- a/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2.py +++ b/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2.py @@ -27,6 +27,9 @@ "db_j6gen2_v7", "db_j6gen2_v8", "db_j6gen2_v9", + "db_j6gen2_v10", + "db_j6gen2_v11", + "db_j6gen2_v12", ] dataset_test_groups = { @@ -127,6 +130,8 @@ "traffic_cone": "traffic_cone", "trafficcone": "traffic_cone", "barrier": "barrier", + "other_vehicle": "car", + "other_pedestrian": "pedestrian", } class_names = [ diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py b/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py index b9ec03f27..ef0141a5b 100644 --- a/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py +++ b/autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py @@ -27,6 +27,9 @@ "db_j6gen2_v7", "db_j6gen2_v8", "db_j6gen2_v9", + "db_j6gen2_v10", + "db_j6gen2_v11", + "db_j6gen2_v12", "db_largebus_v1", "db_largebus_v2", "db_largebus_v3", @@ -133,6 +136,8 @@ "traffic_cone": "traffic_cone", "trafficcone": "traffic_cone", "barrier": "barrier", + "other_vehicle": "car", + "other_pedestrian": "pedestrian", } diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py b/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py index c08decfa1..0f00a651d 100644 --- a/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py +++ b/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py @@ -123,6 +123,8 @@ "traffic_cone": "traffic_cone", "trafficcone": "traffic_cone", "barrier": "barrier", + "other_vehicle": "car", + "other_pedestrian": "pedestrian", } class_names = [ diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_gen2.py b/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_gen2.py index dbd6e2813..9995cd9b7 100644 --- a/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_gen2.py +++ b/autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_gen2.py @@ -120,6 +120,8 @@ "traffic_cone": "traffic_cone", "trafficcone": "traffic_cone", "barrier": "barrier", + "other_vehicle": "car", + "other_pedestrian": "pedestrian", } class_names = [ diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/largebus.py b/autoware_ml/configs/detection3d/dataset/t4dataset/largebus.py index 2212b8e56..cd42362b5 100644 --- a/autoware_ml/configs/detection3d/dataset/t4dataset/largebus.py +++ b/autoware_ml/configs/detection3d/dataset/t4dataset/largebus.py @@ -122,6 +122,8 @@ "traffic_cone": "traffic_cone", "trafficcone": "traffic_cone", "barrier": "barrier", + "other_vehicle": "car", + "other_pedestrian": "pedestrian", } class_names = [ diff --git a/autoware_ml/configs/detection3d/default_runtime.py b/autoware_ml/configs/detection3d/default_runtime.py index cc2b896f7..6da761425 100644 --- a/autoware_ml/configs/detection3d/default_runtime.py +++ b/autoware_ml/configs/detection3d/default_runtime.py @@ -2,9 +2,17 @@ default_hooks = dict( timer=dict(type="IterTimerHook"), - logger=dict(type="LoggerHook", interval=50), + logger=dict( + type="LoggerHook", + interval=50, + backend_args=dict(backend="local"), + ), param_scheduler=dict(type="ParamSchedulerHook"), - checkpoint=dict(type="CheckpointHook", interval=-1), + checkpoint=dict( + type="CheckpointHook", + interval=-1, + backend_args=dict(backend="local"), + ), sampler_seed=dict(type="DistSamplerSeedHook"), visualization=dict(type="Det3DVisualizationHook"), ) diff --git a/autoware_ml/detection3d/datasets/t4dataset.py b/autoware_ml/detection3d/datasets/t4dataset.py index ce1c78f31..38c5f69dd 100644 --- a/autoware_ml/detection3d/datasets/t4dataset.py +++ b/autoware_ml/detection3d/datasets/t4dataset.py @@ -2,6 +2,7 @@ from typing import List import numpy as np +import tqdm from mmdet3d.datasets import NuScenesDataset from mmengine.logging import print_log from mmengine.registry import DATASETS @@ -56,16 +57,14 @@ def filter_data(self) -> List[dict]: return self.data_list filtered_data_list = [] - for entry in self.data_list: + for entry in tqdm.tqdm(self.data_list, desc="Filtering data"): filtered = False for camera_order in filter_frames_with_camera_order: if camera_order not in entry["images"]: filtered = True break - if entry["images"][camera_order]["img_path"] is None or not osp.exists( - entry["images"][camera_order]["img_path"] - ): + if entry["images"][camera_order]["img_path"] is None: filtered = True break @@ -180,6 +179,7 @@ def parse_data_info(self, info: dict) -> dict: cam_prefix, img_info["img_path"], ) + # print_log(f"Camera path: {img_info['img_path']}", logger="current") if self.default_cam_key is not None: info["img_path"] = info["images"][self.default_cam_key]["img_path"] @@ -192,4 +192,7 @@ def parse_data_info(self, info: dict) -> dict: else: info["lidar2img"] = info["cam2img"] @ info["lidar2cam"] + # Default difficulty to 0 if not present + if "difficulty" not in info: + info["difficulty"] = 0 return info diff --git a/autoware_ml/detection3d/datasets/transforms/__init__.py b/autoware_ml/detection3d/datasets/transforms/__init__.py index 6bc932f1a..a63ff1eea 100644 --- a/autoware_ml/detection3d/datasets/transforms/__init__.py +++ b/autoware_ml/detection3d/datasets/transforms/__init__.py @@ -1,3 +1,4 @@ +from .local_3d_bbox import Local3DBBoxExpand from .object_min_points_filter import ObjectMinPointsFilter -__all__ = ["ObjectMinPointsFilter"] +__all__ = ["ObjectMinPointsFilter", "Local3DBBoxExpand"] diff --git a/autoware_ml/detection3d/datasets/transforms/local_3d_bbox.py b/autoware_ml/detection3d/datasets/transforms/local_3d_bbox.py new file mode 100644 index 000000000..ae06d4005 --- /dev/null +++ b/autoware_ml/detection3d/datasets/transforms/local_3d_bbox.py @@ -0,0 +1,58 @@ +from typing import List + +import numpy as np +from mmcv.transforms import BaseTransform +from mmdet3d.structures.ops import box_np_ops +from mmengine.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class Local3DBBoxExpand(BaseTransform): + """Locally expand the 3D bounding boxes by scaling the width, which it doesn't scale the points. + + Args: + expand_widths: (List[float]): Uniformly sampled expand width. + width_dim: (int): The dimension of the width. Default is 4, which is the width dimension of the 3D + bounding box. Since 3D Bbox is in the format of [x, y, z, dx, dy, dz, heading], the width dimension is the + 4th dimension. + label_ids: (List[int]): The label IDs to expand. If None, all label IDs will be expanded. + """ + + def __init__(self, expand_widths: List[float], width_dim: int = 4, label_ids: List[int] = None) -> None: + assert isinstance(expand_widths, list) + assert len(expand_widths) == 2 + assert expand_widths[0] < expand_widths[1] + self.expand_widths = expand_widths + self.width_dim = width_dim + self.label_ids = label_ids + + def transform(self, input_dict: dict) -> dict: + """Call function to locally augment the 3D bounding boxes by scaling the width. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after locally augmenting the 3D bounding boxes by scaling the width, 'gt_bboxes_3d' \ + key is updated in the result dict. + """ + # Label mask + if self.label_ids is not None: + label_masks = [True if label in self.label_ids else False for label in input_dict["gt_labels_3d"]] + else: + label_masks = np.ones(len(input_dict["gt_labels_3d"]), dtype=bool) + + for i in range(len(input_dict["gt_bboxes_3d"])): + if not label_masks[i]: + continue + + expand_width = np.random.uniform(self.expand_widths[0], self.expand_widths[1]) + input_dict["gt_bboxes_3d"].tensor[i, self.width_dim] += expand_width + + return input_dict + + def __repr__(self) -> str: + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f"(expand_widths={self.expand_widths}, width_dim={self.width_dim}, label_ids={self.label_ids})" + return repr_str diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py index 60a64b532..c4afb6a86 100644 --- a/projects/BEVFusion/bevfusion/__init__.py +++ b/projects/BEVFusion/bevfusion/__init__.py @@ -1,11 +1,19 @@ from .bevfusion import BEVFusion from .bevfusion_head import BEVFusionHead, ConvFuser from .bevfusion_necks import GeneralizedLSSFPN +from .bevfusion_voxel_encoder import BEVFusionVoxelFeatureNet, HardSimpleVoxelSinCosEncoder from .depth_lss import DepthLSSTransform, LSSTransform -from .loading import BEVLoadMultiViewImageFromFiles +from .depth_lss_v2 import LSSTransformV2, LSSTransformV2DepthAware +from .loading import BEVLoadMultiViewImageFromFiles, PointsToMultiViewImageDepths from .sparse_encoder import BEVFusionSparseEncoder from .transformer import TransformerDecoderLayer -from .transforms_3d import BEVFusionGlobalRotScaleTrans, BEVFusionRandomFlip3D, GridMask, ImageAug3D +from .transforms_3d import ( + BEVFusionGlobalRotScaleTrans, + BEVFusionRandomFlip3D, + BEVFusionRemoveLiDARPoints, + GridMask, + ImageAug3D, +) from .utils import BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D, IoU3DCost, TransFusionBBoxCoder __all__ = [ @@ -26,5 +34,11 @@ "TransformerDecoderLayer", "BEVFusionRandomFlip3D", "BEVFusionGlobalRotScaleTrans", + "BEVFusionRemoveLiDARPoints", "TransFusionBBoxCoder", + "HardSimpleVoxelSinCosEncoder", + "BEVFusionVoxelFeatureNet", + "LSSTransformV2", + "PointsToMultiViewImageDepths", + "LSSTransformV2DepthAware", ] diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 243b3beb5..2eb3ef3d3 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -1,7 +1,10 @@ +import math from collections import OrderedDict from copy import deepcopy +from pathlib import Path from typing import Dict, List, Optional, Tuple +import matplotlib.pyplot as plt import numpy as np import torch import torch.distributed as dist @@ -9,6 +12,7 @@ from mmdet3d.registry import MODELS from mmdet3d.structures import Det3DDataSample from mmdet3d.utils import OptConfigType, OptMultiConfig, OptSampleList +from mmengine.logging import print_log from mmengine.utils import is_list_of from torch import Tensor from torch.nn import functional as F @@ -34,6 +38,9 @@ def __init__( bbox_head: Optional[dict] = None, init_cfg: OptMultiConfig = None, seg_head: Optional[dict] = None, + loss_depth_weight: float = 3.0, + depth_gt_downsample: int = 1, + visualize_gt_depth_dir: Optional[str] = None, **kwargs, ) -> None: """Initialize BEVFusion model. @@ -56,12 +63,10 @@ def __init__( super().__init__(data_preprocessor=data_preprocessor, init_cfg=init_cfg) if voxelize_cfg is not None: - self.voxelize_reduce = voxelize_cfg.pop("voxelize_reduce") self.pts_voxel_layer = Voxelization(**voxelize_cfg) self.pts_voxel_encoder = MODELS.build(pts_voxel_encoder) self.pts_middle_encoder = MODELS.build(pts_middle_encoder) else: - self.voxelize_reduce = False self.pts_voxel_layer = None self.pts_voxel_encoder = None self.pts_middle_encoder = None @@ -76,8 +81,12 @@ def __init__( self.pts_neck = MODELS.build(pts_neck) if pts_neck is not None else None self.bbox_head = MODELS.build(bbox_head) - - self.init_weights() + self._weights_initialized = False + self.loss_depth_weight = loss_depth_weight + self.depth_gt_downsample = depth_gt_downsample + self.visualize_gt_depth_dir = Path(visualize_gt_depth_dir) if visualize_gt_depth_dir is not None else None + if self.visualize_gt_depth_dir is not None: + self.visualize_gt_depth_dir.mkdir(parents=True, exist_ok=True) def _forward( self, batch_inputs_dict: Tensor, batch_data_samples: OptSampleList = [], using_image_features=False, **kwargs @@ -133,8 +142,11 @@ def parse_losses(self, losses: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, D return loss, log_vars # type: ignore def init_weights(self) -> None: + if self._weights_initialized: + return if self.img_backbone is not None: self.img_backbone.init_weights() + self._weights_initialized = True @property def with_bbox_head(self): @@ -146,6 +158,53 @@ def with_seg_head(self): """bool: Whether the detector has a segmentation head.""" return hasattr(self, "seg_head") and self.seg_head is not None + def prepare_camera_depth_aware_parameters( + self, + camera_intrinsics: torch.Tensor, + img_aug_matrix: torch.Tensor, + lidar_aug_matrix: torch.Tensor, + camera2lidar: torch.Tensor, + ) -> torch.Tensor: + """ + Args: + camera_intrinsics: torch.Tensor, the camera intrinsics of shape (B, N, 3, 3). + img_aug_matrix: torch.Tensor, the image augmentation matrix of shape (B, N, 4, 4). + lidar_aug_matrix: torch.Tensor, the lidar augmentation matrix of shape (B, 4, 4). + camera2lidar: torch.Tensor, the camera to lidar matrix of shape (B, N, 4, 4). + Returns: + torch.Tensor, the camera depth aware parameters of shape (B*N, N_CAMERA_DEPTH_PARAMETERS). + """ + B, N, _, _ = camera_intrinsics.shape + lidar_aug_matrix = lidar_aug_matrix.view(B, 1, 4, 4).repeat(1, N, 1, 1) + + # (B*N, 15) + mlp_input = torch.stack( + [ + camera_intrinsics[:, :, 0, 0], # fx + camera_intrinsics[:, :, 1, 1], # fy + camera_intrinsics[:, :, 0, 2], # cx + camera_intrinsics[:, :, 1, 2], # cy + img_aug_matrix[:, :, 0, 0], # r11 + img_aug_matrix[:, :, 0, 1], # r12 + img_aug_matrix[:, :, 0, 3], # t1 + img_aug_matrix[:, :, 1, 0], # r21 + img_aug_matrix[:, :, 1, 1], # r22 + img_aug_matrix[:, :, 1, 3], # t2 + lidar_aug_matrix[:, :, 0, 0], # r11 + lidar_aug_matrix[:, :, 0, 1], # r12 + lidar_aug_matrix[:, :, 1, 0], # r21 + lidar_aug_matrix[:, :, 1, 1], # r22 + lidar_aug_matrix[:, :, 2, 2], # r33 + ], + dim=-1, + ) + # (B, N, 4, 4) -> (B, N, 3, 4) -> (B*N, 12) + camera2lidar_flatten = camera2lidar[:, :, :3, :].view(B, N, -1) + + # (B, N, 15+12) + mlp_input = torch.cat([mlp_input, camera2lidar_flatten], dim=-1) + return mlp_input + def get_image_backbone_features(self, x: torch.Tensor) -> torch.Tensor: B, N, C, H, W = x.size() x = x.view(B * N, C, H, W).contiguous() @@ -176,14 +235,15 @@ def extract_img_feat( lidar_aug_matrix_inverse=None, geom_feats=None, using_image_features=False, - ) -> torch.Tensor: + camera_depth_aware_parameters=None, + ) -> Tuple[torch.Tensor, torch.Tensor]: if not using_image_features: x = self.get_image_backbone_features(x) - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast("cuda", enabled=False): # with torch.autocast(device_type='cuda', dtype=torch.float32): - x = self.view_transform( + x, pred_depths = self.view_transform( x, points, lidar2image, @@ -196,29 +256,32 @@ def extract_img_feat( img_aug_matrix_inverse, lidar_aug_matrix_inverse, geom_feats, + camera_depth_aware_parameters=camera_depth_aware_parameters, ) - return x + return x, pred_depths def extract_pts_feat(self, feats, coords, sizes, points=None) -> torch.Tensor: if points is not None: # NOTE(knzo25): training and normal inference - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast("cuda", enabled=False): # with torch.autocast('cuda', enabled=False): points = [point.float() for point in points] feats, coords, sizes = self.voxelize(points) batch_size = coords[-1, 0] + 1 else: - # NOTE(knzo25): onnx inference. Voxelization happens outside the graph - with torch.cuda.amp.autocast(enabled=False): + # NOTE: (knzo25): onnx inference. Voxelization happens outside the graph + with torch.amp.autocast("cuda", enabled=False): + # with torch.autocast('cuda', enabled=False): + + # NOTE(knzo25): onnx demmands this + # batch_size = coords[-1, 0] + 1 # with torch.autocast('cuda', enabled=False): # NOTE(knzo25): onnx demmands this # batch_size = coords[-1, 0] + 1 batch_size = 1 print("Run onnx point_eSpConvst") - assert self.voxelize_reduce - if self.voxelize_reduce: - feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(-1, 1) + feats = self.pts_voxel_encoder(feats, sizes, coords) x = self.pts_middle_encoder(feats, coords, batch_size) return x @@ -241,11 +304,12 @@ def voxelize(self, points): feats = torch.cat(feats, dim=0) coords = torch.cat(coords, dim=0) - if len(sizes) > 0: - sizes = torch.cat(sizes, dim=0) - if self.voxelize_reduce: - feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(-1, 1) - feats = feats.contiguous() + assert len(sizes) > 0, "No points in the voxel" + sizes = torch.cat(sizes, dim=0) + + # if self.voxelize_reduce: + # feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(-1, 1) + # feats = feats.contiguous() return feats, coords, sizes @@ -281,7 +345,7 @@ def predict( contains a tensor with shape (num_instances, 7). """ batch_input_metas = [item.metainfo for item in batch_data_samples] - feats = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features) + feats, _ = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features) if self.with_bbox_head: outputs = self.bbox_head.predict(feats, batch_input_metas) @@ -302,6 +366,7 @@ def extract_feat( features = [] is_onnx_inference = False + pred_depths = None if imgs is not None and "lidar2img" not in batch_inputs_dict: # NOTE(knzo25): normal training and testing imgs = imgs.contiguous() @@ -319,7 +384,13 @@ def extract_feat( camera2lidar = imgs.new_tensor(np.asarray(camera2lidar)) img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix)) lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) - img_feature = self.extract_img_feat( + camera_depth_aware_parameters = self.prepare_camera_depth_aware_parameters( + camera_intrinsics=camera_intrinsics, + img_aug_matrix=img_aug_matrix, + lidar_aug_matrix=lidar_aug_matrix, + camera2lidar=camera2lidar, + ) + img_feature, pred_depths = self.extract_img_feat( imgs, deepcopy(points), lidar2image, @@ -329,6 +400,7 @@ def extract_feat( lidar_aug_matrix, batch_input_metas, using_image_features=using_image_features, + camera_depth_aware_parameters=camera_depth_aware_parameters, ) features.append(img_feature) elif imgs is not None: @@ -340,8 +412,10 @@ def extract_feat( img_aug_matrix = batch_inputs_dict["img_aug_matrix"] lidar_aug_matrix = batch_inputs_dict["lidar_aug_matrix"] geom_feats = batch_inputs_dict["geom_feats"] + # Retrieve the parameters from deployment code directly + camera_depth_aware_parameters = batch_inputs_dict["camera_depth_aware_parameters"] - img_feature = self.extract_img_feat( + img_feature, pred_depths = self.extract_img_feat( imgs, points, lidar2image, @@ -352,6 +426,7 @@ def extract_feat( batch_input_metas, geom_feats=geom_feats, using_image_features=using_image_features, + camera_depth_aware_parameters=camera_depth_aware_parameters, ) features.append(img_feature) @@ -376,7 +451,7 @@ def extract_feat( if self.pts_neck is not None: x = self.pts_neck(x) - return x + return x, pred_depths def loss( self, @@ -386,12 +461,145 @@ def loss( **kwargs, ) -> List[Det3DDataSample]: batch_input_metas = [item.metainfo for item in batch_data_samples] - feats = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features) + feats, pred_depths = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features) losses = dict() + if self.loss_depth_weight > 0 and pred_depths is not None: + with torch.amp.autocast("cuda", enabled=False): + gt_depths = torch.stack( + [ + ( + meta["gt_depths"] + if isinstance(meta["gt_depths"], torch.Tensor) + else torch.as_tensor(meta["gt_depths"]) + ) + for meta in batch_input_metas + ] + ).to(device=pred_depths.device, dtype=torch.float32) + depth_loss = self.get_depth_loss(gt_depths, pred_depths) + losses["loss_depth"] = depth_loss + if self.with_bbox_head: bbox_loss = self.bbox_head.loss(feats, batch_data_samples) - - losses.update(bbox_loss) + losses.update(bbox_loss) return losses + + def _visualize_one_hot_gt_depth( + self, + gt_depths_one_hot: Tensor, + batch_size: int, + num_cameras: int, + height: int, + width: int, + batch_idx: int = 0, + num_channels: int = 6, + ) -> None: + """Save one-hot depth GT maps for the first batch and first few depth channels. + + Args: + gt_depths_one_hot (Tensor): One-hot depth GT of shape [B*N*H*W, D]. + batch_size (int): Batch size B from the original input. + num_cameras (int): Number of camera views N from the original input. + height (int): Original input height H before downsampling. + width (int): Original input width W before downsampling. + batch_idx (int): Batch index to visualize. + num_channels (int): Number of depth-bin channels to visualize. + """ + if self.visualize_gt_depth_dir is None: + return + + if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0: + return + + if batch_size <= batch_idx or num_cameras == 0: + return + + downsample = self.depth_gt_downsample + height_down = height // downsample + width_down = width // downsample + num_depth_bins = gt_depths_one_hot.shape[1] + + num_channels = min(num_channels, num_depth_bins) + if num_channels == 0 or height_down == 0 or width_down == 0: + return + + with torch.no_grad(): + one_hot = gt_depths_one_hot.view(batch_size, num_cameras, height_down, width_down, num_depth_bins) + depth_channels = one_hot[batch_idx, 0, :, :, :num_channels].detach().float().cpu().numpy() + + ncols = min(3, num_channels) + nrows = math.ceil(num_channels / ncols) + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows), squeeze=False) + + dbounds = self.view_transform.dbound + for ch_idx in range(num_channels): + ax = axes[ch_idx // ncols, ch_idx % ncols] + channel_map = depth_channels[:, :, ch_idx] + depth_m = dbounds[0] + (ch_idx + 0.5) * dbounds[2] + im = ax.imshow(channel_map, cmap="viridis", vmin=0, vmax=1, interpolation="nearest") + ax.set_title(f"batch {batch_idx}, depth bin {ch_idx} (~{depth_m:.1f}m)") + ax.set_xticks([]) + ax.set_yticks([]) + fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04) + + for ch_idx in range(num_channels, nrows * ncols): + axes[ch_idx // ncols, ch_idx % ncols].axis("off") + + fig.suptitle(f"one-hot gt_depth (batch={batch_idx}, cam=0, bins=0-{num_channels - 1})") + fig.tight_layout() + + if not hasattr(self, "_gt_depth_one_hot_vis_count"): + self._gt_depth_one_hot_vis_count = 0 + self._gt_depth_one_hot_vis_count += 1 + save_path = self.visualize_gt_depth_dir / f"gt_depth_one_hot_{self._gt_depth_one_hot_vis_count:06d}.png" + fig.savefig(save_path, dpi=150, bbox_inches="tight") + plt.close(fig) + print_log(f"Saved one-hot gt_depth visualization to {save_path.resolve()}") + + def get_downsampled_gt_depth(self, gt_depths): + """ + Input: + gt_depths: [B, N, H, W] + Output: + gt_depths: [B*N*h*w, d] + """ + B, N, H, W = gt_depths.shape + D = self.view_transform.D + dbounds = self.view_transform.dbound + gt_depths = gt_depths.view( + B * N, + H // self.depth_gt_downsample, + self.depth_gt_downsample, + W // self.depth_gt_downsample, + self.depth_gt_downsample, + 1, + ) + gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous() + gt_depths = gt_depths.view(-1, self.depth_gt_downsample * self.depth_gt_downsample) + gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths) + gt_depths = torch.min(gt_depths_tmp, dim=-1).values + gt_depths = gt_depths.view(B * N, H // self.depth_gt_downsample, W // self.depth_gt_downsample) + + gt_depths = (gt_depths - (dbounds[0] - dbounds[2])) / dbounds[2] + # gt_depths = torch.where(gt_depths >= 0.0, gt_depths, torch.zeros_like(gt_depths)) + # gt_depths = torch.clamp(gt_depths, max=float(D)) + gt_depths = torch.where((gt_depths >= 0.0) & (gt_depths < D + 1), gt_depths, torch.zeros_like(gt_depths)) + # gt_depths = torch.clamp(gt_depths, max=float(D)) + gt_depths = F.one_hot(gt_depths.long(), num_classes=D + 1).view(-1, D + 1)[:, 1:] + self._visualize_one_hot_gt_depth(gt_depths, B, N, H, W) + return gt_depths.float() + + def get_depth_loss(self, depth_labels, depth_preds): + depth_labels = self.get_downsampled_gt_depth(depth_labels) + # (B, N, D, H, W) -> (B*N*H*W, D) + depth_preds = depth_preds.permute(0, 1, 3, 4, 2).contiguous().view(-1, self.view_transform.D) + fg_mask = torch.max(depth_labels, dim=1).values > 0.0 + depth_labels = depth_labels[fg_mask] + depth_preds = depth_preds[fg_mask] + depth_loss = F.binary_cross_entropy( + depth_preds, + depth_labels, + reduction="none", + ).sum() / max(1.0, fg_mask.sum()) + return self.loss_depth_weight * depth_loss diff --git a/projects/BEVFusion/bevfusion/bevfusion_head.py b/projects/BEVFusion/bevfusion/bevfusion_head.py index c37c5a538..8dc4bce45 100644 --- a/projects/BEVFusion/bevfusion/bevfusion_head.py +++ b/projects/BEVFusion/bevfusion/bevfusion_head.py @@ -17,6 +17,8 @@ from mmengine.structures import InstanceData from torch import nn +from .ops.topk.topk import topk + def clip_sigmoid(x, eps=1e-4): y = torch.clamp(x.sigmoid_(), min=eps, max=1 - eps) @@ -26,11 +28,18 @@ def clip_sigmoid(x, eps=1e-4): @MODELS.register_module() class ConvFuser(nn.Sequential): - def __init__(self, in_channels: int, out_channels: int, kernel_size: int, padding: int) -> None: + def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, padding: int) -> None: self.in_channels = in_channels self.out_channels = out_channels super().__init__( - nn.Conv2d(sum(in_channels), out_channels, kernel_size, padding, bias=False), + nn.Conv2d( + sum(in_channels), + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False, + ), nn.BatchNorm2d(out_channels), nn.ReLU(True), ) @@ -63,6 +72,7 @@ def __init__( norm_cfg=dict(type="BN1d"), bias="auto", # loss + loss_iou=None, loss_cls=dict(type="mmdet.GaussianFocalLoss", reduction="mean"), loss_bbox=dict(type="mmdet.L1Loss", reduction="mean"), loss_heatmap=dict(type="mmdet.GaussianFocalLoss", reduction="mean"), @@ -88,8 +98,10 @@ def __init__( if not self.use_sigmoid_cls: self.num_classes += 1 self.loss_cls = MODELS.build(loss_cls) + self.loss_iou = MODELS.build(loss_iou) if loss_iou is not None else None self.loss_bbox = MODELS.build(loss_bbox) self.loss_heatmap = MODELS.build(loss_heatmap) + self.share_conv_out_channels = hidden_channel self.bbox_coder = build_bbox_coder(bbox_coder) self.sampling = False @@ -155,7 +167,11 @@ def __init__( # Position Embedding for Cross-Attention, which is re-used during training # noqa: E501 x_size = self.test_cfg["grid_size"][0] // self.test_cfg["out_size_factor"] y_size = self.test_cfg["grid_size"][1] // self.test_cfg["out_size_factor"] - self.bev_pos = self.create_2D_grid(x_size, y_size) + self.spatial_dim = x_size * y_size + bev_pos = self.create_2D_grid(x_size, y_size) + + # Register the bev_pos as a buffer so it moves to the GPU automatically. + self.register_buffer("bev_pos", bev_pos, persistent=False) # (1, H * W, 2) self.img_feat_pos = None self.img_feat_collapsed_pos = None @@ -174,11 +190,22 @@ def __init__( self.dense_heatmap_exclude_pooling_classes = sorted( list(set(self.class_name_to_indices.values()) - set(self.dense_heatmap_pooling_class_indices)) ) + # Pre-compute the correct order of the classes for the final local_max + heatmap_concat_order = ( + self.dense_heatmap_pooling_class_indices + self.dense_heatmap_exclude_pooling_classes + ) + local_concat_class_remapping = [heatmap_concat_order.index(i) for i in range(self.num_classes)] else: self.dense_heatmap_pooling_class_indices = None self.dense_heatmap_exclude_pooling_classes = None + local_concat_class_remapping = [i for i in range(self.num_classes)] + # Register the remapping as a buffer so it moves to the GPU automatically and gets saved in the state_dict. + self.register_buffer( + "local_concat_class_remapping", torch.tensor(local_concat_class_remapping), persistent=False + ) self.local_heatmap_padding = self.nms_kernel_size // 2 + # NMS clusters self.nms_clusters = self.test_cfg.get("nms_clusters", []) # Add class indices for nms @@ -199,7 +226,8 @@ def __init__( self.partial_ignore_labels = None print_log(f"BEVFusionHead Partial ignore labels: {self.partial_ignore_labels}, dense heatmap pooling classes: \ - {self.dense_heatmap_pooling_classes}, class_names: {self.class_names}", logger="current") + {self.dense_heatmap_pooling_classes}, class_names: {self.class_names}, \ + local_concat_class_remapping: {self.local_concat_class_remapping}", logger="current") def create_2D_grid(self, x_size, y_size): meshgrid = [[0, x_size - 1, x_size], [0, y_size - 1, y_size]] @@ -247,24 +275,21 @@ def forward_single(self, inputs, metas): Returns: list[dict]: Output results for tasks. """ - batch_size = inputs.shape[0] + # batch_size = inputs.shape[0] fusion_feat = self.shared_conv(inputs) ################################# # image to BEV ################################# - fusion_feat_flatten = fusion_feat.view(batch_size, fusion_feat.shape[1], -1) # [BS, C, H*W] - bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(fusion_feat.device) + fusion_feat_flatten = fusion_feat.view(-1, self.share_conv_out_channels, self.spatial_dim) # [BS, C, H*W] ################################# # query initialization ################################# - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast("cuda", enabled=False): # with torch.autocast('cuda', enabled=False): dense_heatmap = self.heatmap_head(fusion_feat.float()) heatmap = dense_heatmap.detach().sigmoid() - local_max = torch.zeros_like(heatmap) - # equals to nms radius = voxel_size * out_size_factor * kenel_size if self.dense_heatmap_pooling_class_indices is not None: # Pooling selected_heatmap = heatmap[:, self.dense_heatmap_pooling_class_indices, :, :] @@ -274,29 +299,44 @@ def forward_single(self, inputs, metas): stride=1, padding=0, ) - local_max[ - :, - self.dense_heatmap_pooling_class_indices, - self.local_heatmap_padding : (-self.local_heatmap_padding), - self.local_heatmap_padding : (-self.local_heatmap_padding), - ] = local_max_inner - # Non-pooling classes + + # 2. Restore spatial size using F.pad instead of slice mutation + local_max = F.pad( + local_max_inner, + ( + self.local_heatmap_padding, + self.local_heatmap_padding, + self.local_heatmap_padding, + self.local_heatmap_padding, + ), + mode="constant", + value=0.0, + ) + + # 3. Any non-pooling classes if self.dense_heatmap_exclude_pooling_classes: - local_max[:, self.dense_heatmap_exclude_pooling_classes] = heatmap[ - :, self.dense_heatmap_exclude_pooling_classes - ] + excluded_local_max = heatmap[:, self.dense_heatmap_exclude_pooling_classes, :, :] + local_max = torch.cat([local_max, excluded_local_max], dim=1) + local_max = local_max[:, self.local_concat_class_remapping, :, :] else: local_max = heatmap heatmap = heatmap * (heatmap == local_max) - heatmap = heatmap.view(batch_size, heatmap.shape[1], -1) + # (BS, num_classes, H*W) + heatmap = heatmap.view(-1, self.num_classes, self.spatial_dim) # top num_proposals among all classes - top_proposals = heatmap.view(batch_size, -1).argsort(dim=-1, descending=True)[..., : self.num_proposals] - top_proposals_class = top_proposals // heatmap.shape[-1] - top_proposals_index = top_proposals % heatmap.shape[-1] + flattened_heatmap = heatmap.view(-1, self.num_classes * self.spatial_dim) + + # Use topk instead of argsort to avoid sorting the entire flattened heatmap. + top_proposals = topk(x=flattened_heatmap, k=self.num_proposals, dim=-1, sorted=False) + + # 2. Calculate class and spatial indices + # Use shape[-1] dynamically to handle grid sizes safely. + top_proposals_class = top_proposals // self.spatial_dim + top_proposals_index = top_proposals % self.spatial_dim query_feat = fusion_feat_flatten.gather( - index=top_proposals_index[:, None, :].expand(-1, fusion_feat_flatten.shape[1], -1), + index=top_proposals_index[:, None, :].expand(-1, self.share_conv_out_channels, -1), dim=-1, ) self.query_labels = top_proposals_class @@ -306,10 +346,8 @@ def forward_single(self, inputs, metas): query_cat_encoding = self.class_encoding(one_hot.float()) query_feat += query_cat_encoding - query_pos = bev_pos.gather( - index=top_proposals_index[:, None, :].permute(0, 2, 1).expand(-1, -1, bev_pos.shape[-1]), - dim=1, - ) + # (B, N, 2) + query_pos = self.bev_pos.squeeze(0)[top_proposals_index] ################################# # transformer decoder layer (Fusion feature as K,V) ################################# @@ -317,7 +355,9 @@ def forward_single(self, inputs, metas): for i in range(self.num_decoder_layers): # Transformer Decoder Layer # :param query: B C Pq :param query_pos: B Pq 3/6 - query_feat = self.decoder[i](query_feat, key=fusion_feat_flatten, query_pos=query_pos, key_pos=bev_pos) + query_feat = self.decoder[i]( + query_feat, key=fusion_feat_flatten, query_pos=query_pos, key_pos=self.bev_pos + ) # Prediction res_layer = self.prediction_heads[i](query_feat) @@ -384,8 +424,10 @@ def predict_by_feat(self, preds_dicts, metas, img=None, rescale=False, for_roi=F for layer_id, preds_dict in enumerate(preds_dicts): batch_size = preds_dict[0]["heatmap"].shape[0] batch_score = preds_dict[0]["heatmap"][..., -self.num_proposals :].sigmoid() - # if self.loss_iou.loss_weight != 0: - # batch_score = torch.sqrt(batch_score * preds_dict[0]['iou'][..., -self.num_proposals:].sigmoid()) # noqa: E501 + if self.loss_iou is not None: + batch_score = torch.sqrt( + batch_score * preds_dict[0]["iou"][..., -self.num_proposals :].sigmoid() + ) # noqa: E501 one_hot = F.one_hot(self.query_labels, num_classes=self.num_classes).permute(0, 2, 1) batch_score = batch_score * preds_dict[0]["query_heatmap_score"] * one_hot @@ -433,6 +475,7 @@ def predict_by_feat(self, preds_dicts, metas, img=None, rescale=False, for_roi=F circle_nms( boxes_for_nms.detach().cpu().numpy(), nms_cluster["nms_threshold"], + post_max_size=nms_cluster["post_max_size"], ) ) else: @@ -780,8 +823,9 @@ def loss_by_feat( for cls_i, class_name in enumerate(self.class_names): loss_dict[f"loss_heatmap_{class_name}"] = loss_heatmap_cls[cls_i] - # Prevent loss item to avoid computing gradients twice. This is for logging. - loss_dict["total_dense_heatmap"] = loss_heatmap_cls.sum() + # Logging-only aggregate. Detach so it does not retain the autograd graph; + # the per-class `loss_heatmap_{class_name}` entries are what drive gradients. + loss_dict["total_dense_heatmap"] = loss_heatmap_cls.sum().detach() # compute loss for each layer for idx_layer in range(self.num_decoder_layers if self.auxiliary else 1): @@ -855,7 +899,22 @@ def loss_by_feat( loss_dict[f"{prefix}_loss_cls"] = layer_loss_cls loss_dict[f"{prefix}_loss_bbox"] = layer_loss_bbox - # loss_dict[f'{prefix}_loss_iou'] = layer_loss_iou + + # Output iou for iou-aware loss + if self.loss_iou is not None: + layer_ious = preds_dict["iou"][ + ..., + idx_layer * self.num_proposals : (idx_layer + 1) * self.num_proposals, + ].squeeze( + 1 + ) # [BS, num_proposals] + + # [BS, num_proposals] + layer_iou_weights = layer_bbox_weights[:, :, 0] + # print(layer_ious.shape, ious.shape, layer_iou_weights.shape, "layer_ious.shape, ious.shape, layer_iou_weights.shape") + loss_dict[f"{prefix}_loss_iou"] = self.loss_iou( + layer_ious, ious, layer_iou_weights, avg_factor=max(num_pos, 1) + ) loss_dict["matched_ious"] = layer_loss_cls.new_tensor(matched_ious) diff --git a/projects/BEVFusion/bevfusion/bevfusion_voxel_encoder.py b/projects/BEVFusion/bevfusion/bevfusion_voxel_encoder.py new file mode 100644 index 000000000..d7801482a --- /dev/null +++ b/projects/BEVFusion/bevfusion/bevfusion_voxel_encoder.py @@ -0,0 +1,400 @@ +from typing import Optional, Tuple + +import numpy as np +import torch +from mmdet3d.models.voxel_encoders.utils import PFNLayer, get_paddings_indicator +from mmdet3d.registry import MODELS +from torch import Tensor, nn + + +@MODELS.register_module() +class HardSimpleVoxelSinCosEncoder(nn.Module): + def __init__( + self, min_norm_values: Tuple[float], max_norm_values: Tuple[float], in_channels: Optional[int] = 4 + ) -> None: + """ + Simple voxel encoder that only performs mean pooling on the normalize features, and then + performs sin-cos (fourier encoding) on each voxel channels. + + The output shape of each voxel is (N, feature_channels*2). + Args: + min_norm_values (Tuple[float]): Minimum values for the features. + max_norm_values (Tuple[float]): Maximum values for the features. + in_channels (int): Number of input channels. + """ + super().__init__() + + # Create PillarFeatureNet layers + self.in_channels = in_channels + + # Convert the ((x - min) / (max - min)) * pi * exponents to x * scale + bias for folding them into one OP + min_norm_values = torch.tensor(min_norm_values) + max_norm_values = torch.tensor(max_norm_values) + # Let alpha = pi * exponents, beta = max - min + # y = ((x - min) / beta) * alpha + # y = alpha / beta * (x - min) + # y = (alpha / beta) * x - (alpha / beta) * min + # Therefore, scale = alpha / beta, bias = - (alpha * min) / beta + # y = scale * x + bias + exponents = (2 ** torch.arange(0, self.in_channels)).float() + alpha = (torch.pi * exponents).unsqueeze(0) # (1, C) + beta = (max_norm_values - min_norm_values).unsqueeze(1) # (C, 1) + scale = alpha / beta + bias = -(alpha * min_norm_values.unsqueeze(1)) / beta # (C, C) + + self.register_buffer("exponent_scale", scale.unsqueeze(0), persistent=False) # (1, C, C) + self.register_buffer("exponent_bias", bias.unsqueeze(0), persistent=False) # (1, C, C) + + def forward(self, features: Tensor, num_points: Tensor, coors: Tensor, *args, **kwargs) -> Tensor: + """Forward function. + + Args: + features (torch.Tensor): Point features or raw points in shape + (N, M, C) in (x, y, z, intensity, time_lag) if C is 5, (x, y, z, time_lag) if C is 4. + num_points (torch.Tensor): Number of points in each pillar in shape (M). + coors (torch.Tensor): Coordinates of each voxel in (M, [4]), which is (batch_idx, z_idx, y_idx, x_idx). + + Returns: + torch.Tensor: Features of pillars in shape (M, C*C*2). + + """ + # Mean in the voxel + # (N, M, C) -> (N, C) + voxel_mean_features = ( + features.sum(dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1) + ).contiguous() + + # x * scale + bias, (1, C, C) + (1, C, C) * (N, C, 1) -> (N, C, C) + # FMA (fused multiply-add): y = bias + scale * voxel_mean_features + y = torch.addcmul(self.exponent_bias, self.exponent_scale, voxel_mean_features.unsqueeze(-1)) + # SinCos encoding + # (N*C, C) -> (N, C*C) + y = y.reshape(-1, self.in_channels * self.in_channels) + # (N, C*C) -> (N, C*C*2) + voxel_fourier_features = torch.cat([torch.cos(y), torch.sin(y)], dim=1) + + return voxel_fourier_features + + +@MODELS.register_module() +class BEVFusionVoxelFeatureNet(HardSimpleVoxelSinCosEncoder): + """BEVFusion Voxel Encoder Feature Net. + + The network is same as HardSimpleVoxelSinCosEncoder, but it performs PFNLayers on the + offset features, for example, distances. After that, it concatenates the fourier features and the PFN features + along the channel dimension for each voxel. + + Args: + min_norm_values (Tuple[float]): Minimum values for the features. + max_norm_values (Tuple[float]): Maximum values for the features. + in_channels (int): Number of input channels. + feat_channels (tuple, optional): Number of features in each of the + N PFNLayers. Defaults to (64, ). + """ + + def __init__( + self, + min_norm_values: Optional[Tuple[float]] = None, + max_norm_values: Optional[Tuple[float]] = None, + in_channels: Optional[int] = 4, + feat_channels: Optional[tuple] = (64,), + with_distance: Optional[bool] = False, + with_cluster_center: Optional[bool] = True, + with_voxel_center: Optional[bool] = True, + voxel_size: Optional[Tuple[float]] = (0.2, 0.2, 4), + point_cloud_range: Optional[Tuple[float]] = (0, -40, -3, 70.4, 40, 1), + norm_cfg: Optional[dict] = dict(type="BN1d", eps=1e-3, momentum=0.01), + mode: Optional[str] = "max", + legacy: Optional[bool] = True, + ): + + super(BEVFusionVoxelFeatureNet, self).__init__( + min_norm_values=min_norm_values, max_norm_values=max_norm_values, in_channels=in_channels + ) + assert len(feat_channels) > 0 + self.legacy = legacy + pfn_in_channels = in_channels + if with_cluster_center: + pfn_in_channels += 3 + if with_voxel_center: + pfn_in_channels += 3 + if with_distance: + pfn_in_channels += 1 + + assert pfn_in_channels > 0, "pfn_in_channels must be greater than 0" + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + + # Create VoxelFeatureNet layers + feat_channels = [pfn_in_channels] + list(feat_channels) + pfn_layers = [] + for i in range(len(feat_channels) - 1): + in_filters = feat_channels[i] + out_filters = feat_channels[i + 1] + if i < len(feat_channels) - 2: + last_layer = False + else: + last_layer = True + pfn_layers.append(PFNLayer(in_filters, out_filters, norm_cfg=norm_cfg, last_layer=last_layer, mode=mode)) + self.pfn_layers = nn.ModuleList(pfn_layers) + + # Need pillar (voxel) size and x/y offset in order to calculate offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + self.point_cloud_range = point_cloud_range + + def forward(self, features: Tensor, num_points: Tensor, coors: Tensor, *args, **kwargs) -> Tensor: + """Forward function. + + Args: + features (torch.Tensor): Point features or raw points in shape + (N, M, C) in (x, y, z, intensity, time_lag) if C is 5, (x, y, z, time_lag) if C is 4. + num_points (torch.Tensor): Number of points in each pillar in shape (M). + coors (torch.Tensor): Coordinates of each voxel in (M, [4]), which is (batch_idx, z_idx, y_idx, x_idx). + + Returns: + torch.Tensor: Features of pillars in shape (M, C*C*2 + feat_channels[-1]). + """ + # (M, C*C*2) + voxel_fourier_features = super().forward(features, num_points, coors) + + # Normalize the features + norm_features = (features - self.min_norm_values.view(1, -1)) / ( + (self.max_norm_values - self.min_norm_values).view(1, -1) + ) + + # Offset features + max_points_per_voxel = features.shape[1] + + features_ls = [norm_features] + # Find distance of x, y, and z from cluster center, mapped to [-1, 1] if available + if self._with_cluster_center: + points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_points.type_as(features).view(-1, 1, 1) + f_cluster = features[:, :, :3] - points_mean + # Map to [0, 1] if available + # if self.min_norm_values is not None and self.max_norm_values is not None: + # voxel_size = features.new_tensor([self.vx, self.vy, self.vz]) + # f_cluster = f_cluster / voxel_size + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + dtype = features.dtype + if self._with_voxel_center: + if not self.legacy: + f_center = torch.zeros_like(features[:, :, :3]) + f_center[:, :, 0] = features[:, :, 0] - (coors[:, 3].to(dtype).unsqueeze(1) * self.vx + self.x_offset) + f_center[:, :, 1] = features[:, :, 1] - (coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset) + f_center[:, :, 2] = features[:, :, 2] - (coors[:, 1].to(dtype).unsqueeze(1) * self.vz + self.z_offset) + else: + f_center = features[:, :, :3] + f_center[:, :, 0] = f_center[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + self.x_offset + ) + f_center[:, :, 1] = f_center[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + self.y_offset + ) + f_center[:, :, 2] = f_center[:, :, 2] - ( + coors[:, 1].type_as(features).unsqueeze(1) * self.vz + self.z_offset + ) + + # if self.min_norm_values is not None and self.max_norm_values is not None: + # f_center = f_center / (voxel_size * 0.5) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + voxel_feature_offsets = torch.cat(features_ls, dim=-1) + + # The feature decorations were calculated without regard to whether + # pillar was empty. Need to ensure that + # empty pillars remain set to zeros. + mask = get_paddings_indicator(num_points, max_points_per_voxel, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(voxel_feature_offsets) + voxel_feature_offsets *= mask + + # PFN + for pfn in self.pfn_layers: + voxel_feature_offsets = pfn(voxel_feature_offsets, num_points) + + # Concat + features = torch.cat([voxel_fourier_features, voxel_feature_offsets.squeeze(1)], dim=-1) + + return features + + +# @MODELS.register_module() +# class BEVFusionVoxelSinCosEncoder(nn.Module): +# def __init__(self, +# min_norm_values: Tuple[float], +# max_norm_values: Tuple[float], +# time_lag_channel_index: int = 3, +# time_exp_factor: Optional[float] = None, +# feat_channels: Optional[tuple] = (16, ), +# in_channels: Optional[int] = 4, +# with_distance: Optional[bool] = False, +# with_cluster_center: Optional[bool] = True, +# with_voxel_center: Optional[bool] = True, +# voxel_size: Optional[Tuple[float]] = (0.2, 0.2, 4), +# point_cloud_range: Optional[Tuple[float]] = (0, -40, -3, 70.4, +# 40, 1), +# norm_cfg: Optional[dict] = dict( +# type='BN1d', eps=1e-3, momentum=0.01), +# mode: Optional[str] = 'max'): +# super(BEVFusionVoxelSinCosEncoder, self).__init__() + +# self._with_distance = with_distance +# self._with_cluster_center = with_cluster_center +# self._with_voxel_center = with_voxel_center +# # Create PillarFeatureNet layers +# self.in_channels = in_channels + +# # Need pillar (voxel) size and x/y offset in order to calculate offset +# self.vx = voxel_size[0] +# self.vy = voxel_size[1] +# self.vz = voxel_size[2] +# self.x_offset = self.vx / 2 + point_cloud_range[0] +# self.y_offset = self.vy / 2 + point_cloud_range[1] +# self.z_offset = self.vz / 2 + point_cloud_range[2] +# self.point_cloud_range = point_cloud_range + +# self.xyz_channels = 3 +# feat_offset_channels = in_channels - self.xyz_channels +# if with_cluster_center: +# feat_offset_channels += 3 +# if with_voxel_center: +# feat_offset_channels += 3 +# if with_distance: +# feat_offset_channels += 1 + +# feat_channels = [feat_offset_channels] + list(feat_channels) +# assert len(feat_channels) > 0, "feat_channels must be greater than 0" +# pfn_layers = [] +# for i in range(len(feat_channels) - 1): +# in_filters = feat_channels[i] +# out_filters = feat_channels[i + 1] +# if i < len(feat_channels) - 2: +# last_layer = False +# else: +# last_layer = True +# pfn_layers.append( +# PFNLayer( +# in_filters, +# out_filters, +# norm_cfg=norm_cfg, +# last_layer=last_layer, +# mode=mode)) +# self.pfn_layers = nn.ModuleList(pfn_layers) + +# self.time_lag_channel_index = time_lag_channel_index +# self.time_exp_factor = time_exp_factor + +# self.register_buffer("min_norm_values", torch.tensor(min_norm_values)) +# self.register_buffer("max_norm_values", torch.tensor(max_norm_values)) +# self.register_buffer("voxel_size", torch.tensor([self.vx, self.vy, self.vz])) +# self.register_buffer("exponents", (2 ** torch.arange(0, self.xyz_channels)).float()) + +# def forward(self, features: Tensor, num_points: Tensor, coors: Tensor, +# *args, **kwargs) -> Tensor: +# """Forward function. + +# Args: +# features (torch.Tensor): Point features or raw points in shape +# (N, M, C). +# num_points (torch.Tensor): Number of points in each pillar in shape (M). +# coors (torch.Tensor): Coordinates of each voxel in (M, [4]), which is (batch_idx, z_idx, y_idx, x_idx). + +# Returns: +# torch.Tensor: Features of pillars in shape (M, C). +# """ +# num_voxels, max_points_per_voxel = features.shape[0], features.shape[1] + +# # Mean in the voxel +# # (N, M, 3) -> (N, 3) +# voxel_features = (features[:, :, :self.xyz_channels].sum(dim=1, keepdim=False) / num_points.type_as(features).view( +# -1, 1)).contiguous() + +# # min-max normalization, (N, 3) -> (N, 3) +# voxel_features_norm = (voxel_features - \ +# self.min_norm_values[:self.xyz_channels].view(1, -1)) / ((self.max_norm_values[:self.xyz_channels] - self.min_norm_values[:self.xyz_channels]).view(1, -1)) + +# # SinCos encoding +# # (N, 3) -> (N*3, 1) * (1, ) * (1, 3) -> (N*3, 3) +# y = voxel_features_norm.reshape(-1, 1) * np.pi * self.exponents.reshape(1, -1) +# # (N*3, 3) -> (N, 3*3) +# y = y.reshape(num_voxels, -1) +# # (N, 3*3) -> (N, 3*3*2) +# voxel_fourier_features = torch.cat([torch.cos(y), torch.sin(y)], dim=1) + +# # PFN +# # Other features, for example, intensity or time_lag +# other_features = features[:, :, self.xyz_channels:] + +# # Normalization +# other_features_norm = (other_features - self.min_norm_values[self.xyz_channels:]) / (self.max_norm_values[self.xyz_channels:] - self.min_norm_values[self.xyz_channels:]) + +# time_lag_feature_index = self.time_lag_channel_index - self.xyz_channels +# # exponentiate time_lag features, it's higher when the normlized time lag is lower +# # (1.0 when time_lag_features is 0.0) +# if self.time_exp_factor is not None: +# other_features_norm[:, :, time_lag_feature_index] = torch.exp(- other_features_norm[:, :, time_lag_feature_index] * self.time_exp_factor) +# else: +# # Inverse the time_lag feature +# other_features_norm[:, :, time_lag_feature_index] = 1.0 - other_features_norm[:, :, time_lag_feature_index] + +# # Offsets +# voxel_feature_offsets = [other_features_norm] +# # Find distance of x, y, and z from cluster center +# if self._with_cluster_center: +# points_mean = features[:, :, :3].sum( +# dim=1, keepdim=True) / num_points.type_as(features).view( +# -1, 1, 1) + +# # f_cluster = (features[:, :, :3] - points_mean) +# f_cluster = features[:, :, :3] - points_mean +# voxel_feature_offsets.append(f_cluster) + +# # Find distance of x, y, and z from pillar center +# dtype = features.dtype +# if self._with_voxel_center: +# f_center = torch.zeros_like(features[:, :, :3]) +# f_center[:, :, 0] = features[:, :, 0] - ( +# coors[:, 3].to(dtype).unsqueeze(1) * self.vx + +# self.x_offset) +# f_center[:, :, 1] = features[:, :, 1] - ( +# coors[:, 2].to(dtype).unsqueeze(1) * self.vy + +# self.y_offset) +# f_center[:, :, 2] = features[:, :, 2] - ( +# coors[:, 1].to(dtype).unsqueeze(1) * self.vz + +# self.z_offset) + +# # Map to [-1, 1] +# # f_center = f_center / (self.voxel_size * 0.5) +# voxel_feature_offsets.append(f_center) + +# if self._with_distance: +# points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) +# voxel_feature_offsets.append(points_dist) + +# voxel_feature_offsets = torch.cat(voxel_feature_offsets, dim=-1) +# # The feature decorations were calculated without regard to whether +# # pillar was empty. Need to ensure that +# # empty pillars remain set to zeros. +# mask = get_paddings_indicator(num_points, max_points_per_voxel, axis=0) +# mask = torch.unsqueeze(mask, -1).type_as(voxel_feature_offsets) +# voxel_feature_offsets *= mask + +# # PFN +# for pfn in self.pfn_layers: +# voxel_feature_offsets = pfn(voxel_feature_offsets, num_points) + +# # Concat +# features = torch.cat([voxel_fourier_features, voxel_feature_offsets.squeeze(1)], dim=-1) +# return features diff --git a/projects/BEVFusion/bevfusion/custom_sparse_conv_tensor.py b/projects/BEVFusion/bevfusion/custom_sparse_conv_tensor.py new file mode 100644 index 000000000..888d2b1c0 --- /dev/null +++ b/projects/BEVFusion/bevfusion/custom_sparse_conv_tensor.py @@ -0,0 +1,37 @@ +""" +Custom SparseConvTensor for BEVFusion. +This customiztion is used to support cleaner ONNX export of sparse convolutions. +""" + +import torch +from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE + +if IS_SPCONV2_AVAILABLE: + from spconv.pytorch import SparseConvTensor +else: + from mmcv.ops import SparseConvTensor + + +def sparse_to_dense(sparse_tensor: SparseConvTensor, batch_size: int, spatial_shapes: list[int], out_channels: int): + """ + Convert the sparse tensor to a dense tensor. + """ + H, W, D = spatial_shapes + num_cells = batch_size * H * W * D + idx = sparse_tensor.indices.to(sparse_tensor.features.device).long() # [N, 1+D] + b, h, w, d = idx.unbind(1) + # b * (H * W * D) + h*(W*D) + w*D + d + # Factor out the common terms D and W + # (b*H*W + h*W + w) * D + d -> (b*H + h) * W + w) * D + d + linear_idx = ((b * H + h) * W + w) * D + d # [N] + + out = torch.zeros( + [num_cells, sparse_tensor.features.shape[1]], + device=sparse_tensor.features.device, + dtype=sparse_tensor.features.dtype, + ) + # out = out.index_copy(0, linear_idx, sparse_tensor.features) + # out = out.scatter(0, linear_idx, sparse_tensor.features) + scatter_idx = linear_idx.unsqueeze(1).expand(-1, out_channels) # [N, C] + out = out.scatter(0, scatter_idx, sparse_tensor.features) + return out.view(batch_size, H, W, D, out_channels) diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index ac7c5b503..d0a547258 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -1,8 +1,13 @@ # modify from https://github.com/mit-han-lab/bevfusion +import math +from pathlib import Path from typing import Tuple +import matplotlib.pyplot as plt +import numpy as np import torch from mmdet3d.registry import MODELS +from mmengine.logging import print_log from torch import nn from .ops import bev_pool @@ -164,6 +169,7 @@ def __init__( ybound: Tuple[float, float, float], zbound: Tuple[float, float, float], dbound: Tuple[float, float, float], + visualize_bev_feat: bool = False, ) -> None: super().__init__() self.in_channels = in_channels @@ -183,6 +189,7 @@ def __init__( self.frustum = self.create_frustum() self.D = self.frustum.shape[0] self.fp16_enabled = False + self.visualize_bev_feat = visualize_bev_feat def create_frustum(self): iH, iW = self.image_size @@ -319,8 +326,55 @@ def bev_pool_precomputed(self, x, geom_feats, kept, ranks, indices): # collapse Z final = torch.cat(x.unbind(dim=2), 1) + if self.visualize_bev_feat: + self.plot_bev_feat(final) + return final + def plot_bev_feat(self, bev_feat): + """Visualize the BEV feat for the given batch index.""" + try: + import torch.distributed as dist + + if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0: + return + except ImportError: + pass + + batch_idx = 0 + if bev_feat.shape[0] <= batch_idx: + return + + # save first 10 raw channel maps for one batch sample (B, C, Y, X) + num_channels = 10 + with torch.no_grad(): + feat = bev_feat[batch_idx].detach().float().cpu().numpy() + channel_indices = np.arange(min(num_channels, feat.shape[0])) + ncols = min(5, len(channel_indices)) + nrows = math.ceil(len(channel_indices) / ncols) + fig, axes = plt.subplots(nrows, ncols, figsize=(3 * ncols, 3 * nrows), squeeze=False) + for ax, ch_idx in zip(axes.ravel(), channel_indices): + ch_map = feat[ch_idx] + im = ax.imshow(ch_map, cmap="viridis", origin="lower", aspect="equal") + ax.set_title(f"ch {ch_idx}", fontsize=9) + ax.set_xlabel("X") + ax.set_ylabel("Y") + fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04) + for ax in axes.ravel()[len(channel_indices) :]: + ax.axis("off") + fig.suptitle(f"bev_feat channels 0-{len(channel_indices) - 1} (batch={batch_idx})") + fig.tight_layout() + + save_dir = Path("work_dirs/bev_feat_vis_2") + save_dir.mkdir(parents=True, exist_ok=True) + if not hasattr(self, "_bev_feat_vis_count"): + self._bev_feat_vis_count = 0 + self._bev_feat_vis_count += 1 + save_path = save_dir / f"bev_feat_batch{batch_idx}_{self._bev_feat_vis_count:06d}.png" + fig.savefig(save_path, dpi=150, bbox_inches="tight") + plt.close(fig) + print_log(f"Saved BEV feat visualization to {save_path.resolve()}") + def forward( self, img, diff --git a/projects/BEVFusion/bevfusion/depth_lss_v2.py b/projects/BEVFusion/bevfusion/depth_lss_v2.py new file mode 100644 index 000000000..a95a88e4e --- /dev/null +++ b/projects/BEVFusion/bevfusion/depth_lss_v2.py @@ -0,0 +1,534 @@ +import math +from pathlib import Path +from typing import Optional, Tuple + +import numpy as np +import torch +from mmdet3d.registry import MODELS +from mmengine.logging import print_log +from torch import nn +from torch.utils.checkpoint import checkpoint + +from .depth_lss import BaseViewTransform, DepthLSSNet, DownSampleNet, LidarDepthImageNet +from .ops import bev_pool_v2 + + +class SELayer(nn.Module): + """ + Squeeze-and-Excitation (SE) layer. + This is used to modulate features with camera-depth aware parameters. + The code is taken from BEVDET (https://github.com/hustvl/BEVDET). + """ + + def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): + super().__init__() + # Dont need global pooling because inputs are (B*N, C, 1, 1). + self.sequeeze_net = nn.Sequential( + # Squeeze with 1x1 convolution + nn.Conv2d(channels, channels, 1, bias=True), + # Activation + act_layer(), + # Expand with 1x1 convolution + nn.Conv2d(channels, channels, 1, bias=True), + # Gate with sigmoid activation + gate_layer(), + ) + + def forward(self, x: torch.Tensor, depth_aware_features: torch.Tensor) -> torch.Tensor: + """ + Args: + x: Tuple[torch.Tensor, torch.Tensor], the input tuple containing the image features and camera-depth aware parameters. + Returns: + torch.Tensor, the output tensor of shape (B, N, C). + """ + feature_attentions = self.sequeeze_net(depth_aware_features) + return x * feature_attentions + + +class CameraDepthLinearProjectionMLP(nn.Module): + """ + Linear projection module by MLP. This is used to project image (context) features and camera-depth + aware parameters (for example, intrinsics) to embedding space. + The code is taken from BEVDET (https://github.com/hustvl/BEVDET). + """ + + def __init__(self, in_channels: int, hidden_channels: int, out_channels: int, drop_out: float = 0.0): + """ + Args: + in_channels: int, the number of input channels. + hidden_channels: int, the number of hidden channels. + out_channels: int, the number of output channels. + drop_out: float, the dropout rate. + """ + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.drop_out = drop_out + + self.sequential_mlp = nn.Sequential( + nn.Linear(in_channels, hidden_channels), + nn.ReLU(inplace=True), + nn.Dropout(drop_out), + nn.Linear(hidden_channels, out_channels), + nn.Dropout(drop_out), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Args: + x: torch.Tensor, the input tensor of shape (B, N, C). + Returns: + torch.Tensor, the output tensor of shape (B, N, C). + """ + return self.sequential_mlp(x) + + +class CameraDepthAwareNet(nn.Module): + """ + Camera-depth aware depth net. This is used to predict the depth of the scene. + The code is taken from BEVDET (https://github.com/hustvl/BEVDET). + """ + + def __init__( + self, + in_channels: int, + hidden_channels: int, + out_channels: int, + mlp_drop_out: float, + depth_channels: int, + with_cp: bool = False, + num_camera_depth_parameters: int = 27, + ) -> None: + """ + Args: + in_channels: int, the number of input channels. + out_channels: int, the number of output channels. + mlp_drop_out: float, the dropout rate of the MLP. + mlp_hidden_channels: int, the number of hidden channels of the MLP. + mlp_out_channels: int, the number of output channels of the MLP. + """ + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.mlp_drop_out = mlp_drop_out + self.num_camera_depth_parameters = num_camera_depth_parameters + self.depth_channels = depth_channels + self.with_cp = with_cp + + # Input convolution for context/image features + # Camera depth aware parameters branch + self.camera_depth_aware_parameters_bn = nn.BatchNorm1d(self.num_camera_depth_parameters) + + # Context/image feature branch + self.context_input_conv = nn.Sequential( + nn.Conv2d(in_channels, hidden_channels, kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(hidden_channels), + nn.ReLU(inplace=True), + ) + self.context_camera_depth_aware_mlp = CameraDepthLinearProjectionMLP( + in_channels=self.num_camera_depth_parameters, + hidden_channels=hidden_channels, + out_channels=hidden_channels, + drop_out=self.mlp_drop_out, + ) + self.context_se = SELayer(channels=hidden_channels) + self.context_conv = nn.Conv2d(hidden_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True) + + # Depth branch + self.depth_camera_depth_aware_mlp = CameraDepthLinearProjectionMLP( + in_channels=self.num_camera_depth_parameters, + hidden_channels=hidden_channels, + out_channels=hidden_channels, + drop_out=self.mlp_drop_out, + ) + self.depth_se = SELayer(channels=hidden_channels) + self.depth_conv = nn.Sequential( + nn.Conv2d(hidden_channels, depth_channels, kernel_size=1, stride=1, padding=0, bias=True) + ) + + def context_forward( + self, context_features: torch.Tensor, camera_depth_aware_features: torch.Tensor + ) -> torch.Tensor: + """ + Args: + x: torch.Tensor, the input tensor of shape (B*N, C, H, W). + camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B*N, N_CAMERA_DEPTH_PARAMETERS). + Returns: + torch.Tensor, the output tensor of shape (B*N, C, H, W). + """ + context_camera_depth_aware_features = self.context_camera_depth_aware_mlp(camera_depth_aware_features) + # # (B*N, mlp_out_channels) -> (B*N, mlp_out_channels, 1, 1) + context_camera_depth_aware_features = context_camera_depth_aware_features.view(-1, self.hidden_channels, 1, 1) + context_features = self.context_se(context_features, context_camera_depth_aware_features) + context_features = self.context_conv(context_features) + return context_features + + def depth_forward(self, depth_features: torch.Tensor, camera_depth_aware_features: torch.Tensor) -> torch.Tensor: + """ + Args: + depth_features: torch.Tensor, the input tensor of shape (B*N, C, H, W). + camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B, N, D). + Returns: + torch.Tensor, the output tensor of shape (B*N, C, H, W). + """ + depth_camera_depth_aware_features = self.depth_camera_depth_aware_mlp(camera_depth_aware_features) + # # (B*N, mlp_out_channels) -> (B*N, mlp_out_channels, 1, 1) + depth_camera_depth_aware_features = depth_camera_depth_aware_features.view(-1, self.hidden_channels, 1, 1) + # # (B*N, C, H, W) + depth_features = self.depth_se(depth_features, depth_camera_depth_aware_features) + if self.with_cp: + depth_features = checkpoint(self.depth_conv, depth_features) + else: + depth_features = self.depth_conv(depth_features) + return depth_features + + def forward(self, x: torch.Tensor, camera_depth_aware_parameters: torch.Tensor) -> torch.Tensor: + """ + Args: + x: torch.Tensor, the input tensor of shape (B, N, C, H, W). + camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B, N, N_CAMERA_DEPTH_PARAMETERS). + Returns: + torch.Tensor, the output tensor of shape (B*N, C, H, W). + """ + # (B, N, N_CAMERA_DEPTH_PARAMETERS) -> (B*N, N_CAMERA_DEPTH_PARAMETERS) + camera_depth_aware_parameters = camera_depth_aware_parameters.view(-1, self.num_camera_depth_parameters) + + # (B*N, N_CAMERA_DEPTH_PARAMETERS) + camera_depth_aware_features = self.camera_depth_aware_parameters_bn(camera_depth_aware_parameters) + context_input_features = self.context_input_conv(x) + context_features = self.context_forward(context_input_features, camera_depth_aware_features) + depth_features = self.depth_forward(context_input_features, camera_depth_aware_features) + return torch.cat([depth_features, context_features], dim=1) + + +class BaseViewTransformV2(BaseViewTransform): + + def __init__( + self, + in_channels: int, + out_channels: int, + image_size: Tuple[int, int], + feature_size: Tuple[int, int], + xbound: Tuple[float, float, float], + ybound: Tuple[float, float, float], + zbound: Tuple[float, float, float], + dbound: Tuple[float, float, float], + collapse_z: bool = True, + expand_batch_axis: bool = False, + visualize_bev_feat: bool = False, + ): + """ + Args: + collapse_z: collapse the Z axis of the BEV grid + expand_batch_axis: expand the batch axis of the inputs to bev pool if this is set to True. + """ + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + image_size=image_size, + feature_size=feature_size, + xbound=xbound, + ybound=ybound, + zbound=zbound, + dbound=dbound, + visualize_bev_feat=visualize_bev_feat, + ) + self.collapse_z = collapse_z + self.expand_batch_axis = expand_batch_axis + + def get_cam_feats( + self, x, camera_depth_aware_parameters: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError + + def forward( + self, + img, + points, + lidar2image, + camera_intrinsics, + camera2lidar, + img_aug_matrix, + lidar_aug_matrix, + metas, + camera_intrinsics_inverse, + img_aug_matrix_inverse, + lidar_aug_matrix_inverse, + geom_feats_precomputed, + camera_depth_aware_parameters: Optional[torch.Tensor] = None, + ): + if geom_feats_precomputed is not None: + ranks_bev, ranks_depth, ranks_feat = geom_feats_precomputed + x, depth_softmax = self.get_cam_feats(img) + x = self.bev_pool_precomputed(x, depth_softmax, ranks_bev, ranks_depth, ranks_feat) + + # No return depth predictions when precomputed geometry features are used + depth_softmax = None + + else: + intrins = camera_intrinsics[..., :3, :3] + post_rots = img_aug_matrix[..., :3, :3] + post_trans = img_aug_matrix[..., :3, 3] + camera2lidar_rots = camera2lidar[..., :3, :3] + camera2lidar_trans = camera2lidar[..., :3, 3] + + extra_rots = lidar_aug_matrix[..., :3, :3] + extra_trans = lidar_aug_matrix[..., :3, 3] + + geom = self.get_geometry( + camera2lidar_rots, + camera2lidar_trans, + torch.inverse(intrins), + torch.inverse(post_rots), + post_trans, + extra_rots=extra_rots, + extra_trans=extra_trans, + ) + + # depth is not connected to the calibration + # on_img is + # is also flattened_indices + ( + view_feats, + depth_softmax, + ) = self.get_cam_feats(img, camera_depth_aware_parameters) + x = self.bev_pool(view_feats, depth_softmax, geom) + + return x, depth_softmax + + def bev_pool_aux(self, geom_feats): + B, N, D, H, W, C = geom_feats.shape + Nprime = B * N * D * H * W + assert C == 3 + + # record the index of selected points for acceleration purpose + ranks_depth = torch.arange(0, Nprime, dtype=torch.int, device=geom_feats.device) + ranks_feat = torch.arange(0, Nprime // D, dtype=torch.int, device=geom_feats.device) + ranks_feat = ranks_feat.reshape(B, N, 1, H, W) + ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten() + + # flatten indices + geom_feats = ((geom_feats - (self.bx - self.dx / 2.0)) / self.dx).long() + geom_feats = geom_feats.view(Nprime, 3) + batch_ix = torch.cat( + [torch.full([Nprime // B, 1], ix, device=geom_feats.device, dtype=torch.long) for ix in range(B)] + ) + geom_feats = torch.cat((geom_feats, batch_ix), 1) + + # filter out points that are outside box + kept = ( + (geom_feats[:, 0] >= 0) + & (geom_feats[:, 0] < self.nx[0]) + & (geom_feats[:, 1] >= 0) + & (geom_feats[:, 1] < self.nx[1]) + & (geom_feats[:, 2] >= 0) + & (geom_feats[:, 2] < self.nx[2]) + ) + + if len(kept) == 0: + return None, None, None + + geom_feats, ranks_depth, ranks_feat = geom_feats[kept], ranks_depth[kept], ranks_feat[kept] + + # Switch x and y to match the order of the BEV grid + ranks_bev = ( + geom_feats[:, 3] * (self.nx[2] * self.nx[1] * self.nx[0]) + + geom_feats[:, 2] * (self.nx[1] * self.nx[0]) + + geom_feats[:, 0] * self.nx[1] + + geom_feats[:, 1] + ) + indices = ranks_bev.argsort() + ranks_bev, ranks_depth, ranks_feat = ranks_bev[indices], ranks_depth[indices], ranks_feat[indices] + return ( + ranks_bev.int().contiguous(), + ranks_depth.int().contiguous(), + ranks_feat.int().contiguous(), + ) + + def compute_intervals(self, ranks_bev: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + if ranks_bev is None: + return None, None + + kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + kept[1:] = ranks_bev[1:] != ranks_bev[:-1] + interval_starts = torch.where(kept)[0].int() + if len(interval_starts) == 0: + return None, None + + interval_lengths = torch.zeros_like(interval_starts) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] + return interval_starts.int().contiguous(), interval_lengths.int().contiguous() + + def bev_pool(self, view_feats, depth_softmax, geom) -> torch.Tensor: + """ """ + ranks_bev, ranks_depth, ranks_feat = self.bev_pool_aux(geom) + interval_starts, interval_lengths = self.compute_intervals(ranks_bev) + bev_feat = self.compute_bev_pool( + view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths + ) + return bev_feat + + def compute_bev_pool( + self, view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths + ): + """Compute the BEV pool for the given view features, depth softmax, ranks, and intervals.""" + if interval_starts is None: + print_log("warning ---> no points within the predefined bev receptive field") + dummy = torch.zeros( + size=[view_feats.shape[0], view_feats.shape[2], self.nx[2], self.nx[1], self.nx[0]], + dtype=view_feats.dtype, + device=view_feats.device, + ) + if self.collapse_z: + dummy = torch.cat(dummy.unbind(dim=2), 1) + return dummy + + if self.expand_batch_axis: + view_feats = view_feats.unsqueeze(0) + depth_softmax = depth_softmax.unsqueeze(0) + + # permute view_feats from (B, N, C, fH, fW) to (B, N, fH, fW, C) + view_feats = view_feats.permute(0, 1, 3, 4, 2) + bev_feat_shape = ( + depth_softmax.shape[0], + int(self.nx[2]), + int(self.nx[1]), + int(self.nx[0]), + view_feats.shape[-1], + ) # (B, Z, Y, X, C) + bev_feat = bev_pool_v2( + depth=depth_softmax, + feat=view_feats, + ranks_depth=ranks_depth, + ranks_feat=ranks_feat, + ranks_bev=ranks_bev, + interval_starts=interval_starts, + interval_lengths=interval_lengths, + bev_feat_shape=bev_feat_shape, + is_training=self.training, + ) + + # collapse Z + if self.collapse_z: + bev_feat = torch.cat(bev_feat.unbind(dim=2), 1) + + if self.visualize_bev_feat: + self.plot_bev_feat(bev_feat) + + return bev_feat + + def bev_pool_precomputed(self, view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat): + interval_starts, interval_lengths = self.compute_intervals(ranks_bev) + bev_feat = self.compute_bev_pool( + view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths + ) + return bev_feat + + def get_depth_softmax(self, x: torch.Tensor, B, N, fH, fW) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: torch.Tensor, the input tensor of shape (B*N, D+C, H, W). + Returns: + Tuple[torch.Tensor, torch.Tensor], the tuple containing the view features and depth softmax. + view_feats: torch.Tensor, the view features of shape (B, N, C, H, W). + depth_softmax: torch.Tensor, the depth softmax of shape (B, N, D, H, W). + """ + depth_softmax = x[:, : self.D].softmax(dim=1) + depth_softmax = depth_softmax.view(B, N, self.D, fH, fW) + view_feats = x[:, self.D : (self.D + self.C)] + view_feats = view_feats.view(B, N, self.C, fH, fW) + return view_feats, depth_softmax + + +@MODELS.register_module() +class LSSTransformV2(BaseViewTransformV2): + + def __init__( + self, + in_channels: int, + out_channels: int, + image_size: Tuple[int, int], + feature_size: Tuple[int, int], + xbound: Tuple[float, float, float], + ybound: Tuple[float, float, float], + zbound: Tuple[float, float, float], + dbound: Tuple[float, float, float], + downsample: int = 1, + ): + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + image_size=image_size, + feature_size=feature_size, + xbound=xbound, + ybound=ybound, + zbound=zbound, + dbound=dbound, + ) + self.depthnet = nn.Conv2d(self.in_channels, self.D + self.C, 1) + self.downsample = DownSampleNet(downsample, out_channels, out_channels) + + def get_cam_feats( + self, x: torch.Tensor, camera_depth_aware_parameters: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + B, N, C, fH, fW = x.shape + x = x.view(B * N, C, fH, fW) + x = self.depthnet(x) + return self.get_depth_softmax(x, B=B, N=N, fH=fH, fW=fW) + + def forward(self, *args, **kwargs): + x, depth_softmax = super().forward(*args, **kwargs) + x = self.downsample(x) + return x, depth_softmax + + +@MODELS.register_module() +class LSSTransformV2DepthAware(BaseViewTransformV2): + + def __init__( + self, + in_channels: int, + out_channels: int, + image_size: Tuple[int, int], + feature_size: Tuple[int, int], + xbound: Tuple[float, float, float], + ybound: Tuple[float, float, float], + zbound: Tuple[float, float, float], + dbound: Tuple[float, float, float], + camera_depth_aware_configs: dict, + downsample: int = 1, + ): + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + image_size=image_size, + feature_size=feature_size, + xbound=xbound, + ybound=ybound, + zbound=zbound, + dbound=dbound, + ) + self.downsample = DownSampleNet(downsample, out_channels, out_channels) + self.camera_depth_aware_net = CameraDepthAwareNet( + in_channels=in_channels, + hidden_channels=in_channels, + mlp_drop_out=camera_depth_aware_configs["mlp_drop_out"], + depth_channels=self.D, + out_channels=self.C, + ) + + def get_cam_feats( + self, x: torch.Tensor, camera_depth_aware_parameters: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + B, N, C, fH, fW = x.shape + x = x.view(B * N, C, fH, fW) + x = self.camera_depth_aware_net(x, camera_depth_aware_parameters) + return self.get_depth_softmax(x, B=B, N=N, fH=fH, fW=fW) + + def forward(self, *args, **kwargs): + x, depth_softmax = super().forward(*args, **kwargs) + x = self.downsample(x) + return x, depth_softmax diff --git a/projects/BEVFusion/bevfusion/loading.py b/projects/BEVFusion/bevfusion/loading.py index 0478d67a3..7f3322fda 100644 --- a/projects/BEVFusion/bevfusion/loading.py +++ b/projects/BEVFusion/bevfusion/loading.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy -import os -from typing import List, Optional +from pathlib import Path +from typing import List, Optional, Tuple +import matplotlib.pyplot as plt import mmcv import numpy as np +from mmcv.transforms import BaseTransform from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles from mmdet3d.registry import TRANSFORMS from mmengine.fileio import get @@ -217,3 +219,219 @@ def transform(self, results: dict) -> Optional[dict]: results["num_views"] = self.num_views results["num_ref_frames"] = self.num_ref_frames return results + + +@TRANSFORMS.register_module() +class PointsToMultiViewImageDepths(BaseTransform): + """Convert points to multi-view image depths. + + Args: + points (np.ndarray): Points in the world coordinate system. + img_shape (tuple): Shape of the image. + cam2img (np.ndarray): Camera to image transformation matrix. + lidar2cam (np.ndarray): LiDAR to camera transformation matrix. + visualize_dir (str, optional): If set, saves a per-sample subplot + of `gt_depths` (one panel per camera) to this directory. + Useful for debugging the projection. Defaults to None. + max_depth (float): Upper clip for the depth color scale (m). + Defaults to 80. + """ + + def __init__( + self, + img_shape, + num_cameras: int, + depth_bounds: Tuple[float, float], + visualize_dir: Optional[str] = None, + max_depth: float = 80.0, + ): + self.img_shape = img_shape + self.num_cameras = num_cameras + self.visualize_dir = visualize_dir + self.max_depth = max_depth + self.depth_bounds = depth_bounds + self.visualize_dir = Path(visualize_dir) if visualize_dir is not None else None + if self.visualize_dir is not None: + self.visualize_dir.mkdir(parents=True, exist_ok=True) + self._depth_idx = 0 + + def transform(self, results: dict) -> Optional[dict]: + """Call function to load multi-view image from files. + + Args: + results (dict): Result dict containing multi-view image filenames. + + Returns: + dict: The result dict containing the multi-view image data. + Added keys: + - gt_depths (np.ndarray): Ground truth depths in (N, H, W) for (number of cameras, height, width). + """ + lidar2image = np.asarray(results["lidar2img"]) + img_aug_matrix = np.asarray(results["img_aug_matrix"]) if "img_aug_matrix" in results else np.eye(4) + cur_coords = results["points"].numpy()[:, :3] + + # inverse lidar aug + if "lidar_aug_matrix" in results: + lidar_aug_matrix = np.asarray(results["lidar_aug_matrix"]) + lidar_aug_matrix_inverse = np.linalg.inv(lidar_aug_matrix) + cur_coords -= lidar_aug_matrix[:3, 3] + cur_coords = lidar_aug_matrix_inverse[:3, :3] @ cur_coords.transpose(1, 0) + else: + cur_coords = cur_coords.transpose(1, 0) + + # lidar2image + cur_coords = lidar2image[:, :3, :3] @ cur_coords + cur_coords += lidar2image[:, :3, 3].reshape(-1, 3, 1) + + # get 2d coords + dist = cur_coords[:, 2, :] + valid_dist_mask = (dist >= self.depth_bounds[0]) & (dist < self.depth_bounds[1]) + + cur_coords[:, 2, :] = np.clip(cur_coords[:, 2, :], 1e-5, 1e5) + cur_coords[:, :2, :] /= cur_coords[:, 2:3, :] + + # imgaug + cur_coords = img_aug_matrix[:, :3, :3] @ cur_coords + cur_coords += img_aug_matrix[:, :3, 3].reshape(-1, 3, 1) + cur_coords = cur_coords[:, :2, :].transpose(0, 2, 1) + + # normalize coords for grid sample + cur_coords = cur_coords[..., [1, 0]] + on_img = ( + (cur_coords[..., 0] < self.img_shape[0]) + & (cur_coords[..., 0] >= 0) + & (cur_coords[..., 1] < self.img_shape[1]) + & (cur_coords[..., 1] >= 0) + & valid_dist_mask + ) + + # Avoid loops since it's slow + indices = np.nonzero(on_img) + camera_indices = indices[0] + point_indices = indices[1] + masked_coords = cur_coords[camera_indices, point_indices].astype(np.int64) + masked_dist = dist[camera_indices, point_indices] + + # Possibly to have duplicates and the last one will be used, however, the chance is small + flatten_indices = ( + camera_indices * self.img_shape[0] * self.img_shape[1] + + masked_coords[:, 0] * self.img_shape[1] + + masked_coords[:, 1] + ) + depth_flat = np.zeros(self.num_cameras * self.img_shape[0] * self.img_shape[1], dtype=np.float32) + depth_flat[flatten_indices] = masked_dist + depth = depth_flat.reshape(self.num_cameras, self.img_shape[0], self.img_shape[1]) + results["gt_depths"] = depth + + if self.visualize_dir is not None: + self._save_depth_subplot(depth, results) + return results + + def _save_depth_subplot(self, depth: np.ndarray, results: dict) -> None: + """Save `gt_depths` as a subplot with one panel per camera. + + The figure contains three row blocks per camera: + - image underlay (if available) + projected LiDAR depth points + - image pixels only + - depth-only heatmap (no image pixel values) + + Args: + depth (np.ndarray): (num_cameras, H, W) ground-truth depth map. + results (dict): The pipeline result dict; used for the underlay + image and to derive a unique filename. + """ + imgs = results.get("img", None) + + # Layout: + # - Top block: image underlay + projected depth points. + # - Middle block: image pixels only. + # - Bottom block: depth-only heatmap (no image pixel values). + if self.num_cameras <= 6: + base_rows, cols = 1, self.num_cameras + else: + cols = int(np.ceil(np.sqrt(self.num_cameras))) + base_rows = int(np.ceil(self.num_cameras / cols)) + rows = base_rows * 3 + + fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 4 * rows), squeeze=False) + + for c in range(self.num_cameras): + d = depth[c] + ys, xs = np.nonzero(d) + vals = d[ys, xs] + + # Row block 1: image + depth scatter. + ax_overlay = axes[c // cols, c % cols] + if imgs is not None and c < len(imgs): + ax_overlay.imshow(imgs[c].astype(np.uint8)) + if vals.size > 0: + ax_overlay.scatter( + xs, + ys, + c=vals, + cmap="turbo", + vmin=0, + vmax=self.max_depth, + s=1, + ) + else: + ax_overlay.imshow( + d, + cmap="turbo", + vmin=0, + vmax=self.max_depth, + interpolation="nearest", + ) + ax_overlay.set_title(f"cam {c} overlay ({vals.size} pts)") + ax_overlay.set_xticks([]) + ax_overlay.set_yticks([]) + + # Row block 2: image-only visualization. + ax_img = axes[base_rows + (c // cols), c % cols] + if imgs is not None and c < len(imgs): + ax_img.imshow(imgs[c].astype(np.uint8)) + else: + ax_img.imshow( + d, + cmap="gray", + vmin=0, + vmax=self.max_depth, + interpolation="nearest", + ) + ax_img.set_title(f"cam {c} image-only") + ax_img.set_xticks([]) + ax_img.set_yticks([]) + + # Row block 3: depth-only visualization. + ax_depth = axes[(base_rows * 2) + (c // cols), c % cols] + ax_depth.imshow( + d, + cmap="turbo", + vmin=0, + vmax=self.max_depth, + interpolation="nearest", + ) + ax_depth.set_title(f"cam {c} depth-only") + ax_depth.set_xticks([]) + ax_depth.set_yticks([]) + + # Hide any unused subplots when n doesn't fill the grid. + for c in range(self.num_cameras, base_rows * cols): + axes[c // cols, c % cols].axis("off") + axes[base_rows + (c // cols), c % cols].axis("off") + axes[(base_rows * 2) + (c // cols), c % cols].axis("off") + + # Shared depth colorbar with numeric values. + depth_mappable = plt.cm.ScalarMappable(cmap="turbo", norm=plt.Normalize(vmin=0, vmax=self.max_depth)) + depth_mappable.set_array([]) + cbar = fig.colorbar(depth_mappable, ax=axes, location="right", fraction=0.02, pad=0.02) + cbar.set_label("Depth (m)") + + fig.suptitle(f"gt_depths — {self._depth_idx}") + fig.tight_layout(rect=[0, 0, 0.96, 0.97]) + + self._depth_idx += 1 + out_path = self.visualize_dir / f"{self._depth_idx:06d}_gt_depths.png" + fig.savefig(out_path, dpi=120, bbox_inches="tight") + plt.close(fig) + print(f"Saved gt_depths visualization to {out_path}") diff --git a/projects/BEVFusion/bevfusion/ops/__init__.py b/projects/BEVFusion/bevfusion/ops/__init__.py index e08abbc6d..f74f0edbb 100644 --- a/projects/BEVFusion/bevfusion/ops/__init__.py +++ b/projects/BEVFusion/bevfusion/ops/__init__.py @@ -1,4 +1,12 @@ from .bev_pool import bev_pool +from .bev_pool_v2 import bev_pool_v2 from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization -__all__ = ["bev_pool", "Voxelization", "voxelization", "dynamic_scatter", "DynamicScatter"] +__all__ = [ + "bev_pool", + "bev_pool_v2", + "Voxelization", + "voxelization", + "dynamic_scatter", + "DynamicScatter", +] diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py new file mode 100644 index 000000000..ff2fdfff7 --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py @@ -0,0 +1,3 @@ +from .bev_pool_v2 import bev_pool_v2 + +__all__ = ["bev_pool_v2"] diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py new file mode 100644 index 000000000..af1ba15de --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py @@ -0,0 +1,191 @@ +# Copyright (c) Phigent Robotics. All rights reserved. + +import numpy as np +import torch + +from . import bev_pool_v2_ext + + +class QuickCumsumV2TrainingCuda(torch.autograd.Function): + r"""BEVPoolv2 implementation for Lift-Splat-Shoot view transformation. + + Please refer to the `paper `_ + """ + + @staticmethod + def forward( + ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths + ): + ranks_bev = ranks_bev.int() + depth = depth.contiguous().float() + feat = feat.contiguous().float() + ranks_depth = ranks_depth.contiguous().int() + ranks_feat = ranks_feat.contiguous().int() + interval_lengths = interval_lengths.contiguous().int() + interval_starts = interval_starts.contiguous().int() + + out = feat.new_zeros(bev_feat_shape) + + bev_pool_v2_ext.bev_pool_v2_forward( + depth, + feat, + out, + ranks_depth, + ranks_feat, + ranks_bev, + interval_lengths, + interval_starts, + ) + + ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth) + return out + + @staticmethod + def backward(ctx, out_grad): + ranks_bev, depth, feat, ranks_feat, ranks_depth = ctx.saved_tensors + + order = ranks_feat.argsort() + ranks_feat, ranks_depth, ranks_bev = ranks_feat[order], ranks_depth[order], ranks_bev[order] + kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + kept[1:] = ranks_feat[1:] != ranks_feat[:-1] + interval_starts_bp = torch.where(kept)[0].int() + interval_lengths_bp = torch.zeros_like(interval_starts_bp) + interval_lengths_bp[:-1] = interval_starts_bp[1:] - interval_starts_bp[:-1] + interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1] + + depth = depth.contiguous() + feat = feat.contiguous() + ranks_depth = ranks_depth.contiguous() + ranks_feat = ranks_feat.contiguous() + ranks_bev = ranks_bev.contiguous() + interval_lengths_bp = interval_lengths_bp.contiguous() + interval_starts_bp = interval_starts_bp.contiguous() + + depth_grad = depth.new_zeros(depth.shape) + feat_grad = feat.new_zeros(feat.shape) + out_grad = out_grad.contiguous() + bev_pool_v2_ext.bev_pool_v2_backward( + out_grad, + depth_grad, + feat_grad, + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_lengths_bp, + interval_starts_bp, + ) + return depth_grad, feat_grad, None, None, None, None, None, None, None, None + + +class QuickCumsumV2Cuda(torch.autograd.Function): + + @staticmethod + def symbolic( + g, + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_starts, + interval_lengths, + out_height=128, + out_width=128, + ): + """symbolic function for creating onnx op.""" + x = g.op( + "autoware::QuickCumsumV2Cuda", + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_starts, + interval_lengths, + out_height_i=out_height, + out_width_i=out_width, + ) + + # features_shape = _get_tensor_sizes(feat) + # if features_shape is not None and hasattr(x.type(), "with_sizes"): + # output_type = x.type().with_sizes([B, D, H, W, _get_tensor_dim_size(x, -1)]) + # output.setType(output_type) + + @staticmethod + def forward( + ctx, + depth, # B,N,D,H,W + feat, # B,N,H,W,C + ranks_depth, + ranks_feat, + ranks_bev, + interval_starts, + interval_lengths, + out_height=128, + out_width=128, + ): + """run forward.""" + out = feat.new_zeros(depth.shape[0], 1, out_height, out_width, feat.shape[-1]) + bev_feat = bev_pool_v2_ext.bev_pool_v2_forward( + depth, + feat, + out, + ranks_depth, + ranks_feat, + ranks_bev, + interval_lengths, + interval_starts, + ) + return bev_feat + + @staticmethod + def backward(ctx, out_grad): + raise NotImplementedError + + +def bev_pool_v2( + depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, bev_feat_shape, is_training +): + # Always use full (B, Z, H, W, C) buffer; QuickCumsumV2Cuda (Z=1) is ONNX-only. + del is_training + x = QuickCumsumV2TrainingCuda.apply( + depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths + ) + + # Final shape: (B, C, Z, H, W) — matches LSSTransform v1 after permute + x = x.permute(0, 4, 1, 2, 3).contiguous() + return x + + +def test_bev_pool_v2(): + depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9]) + depth = torch.from_numpy(depth).float().cuda() + depth = depth.view(1, 1, 2, 2, 2).requires_grad_() + feat = torch.ones(size=[1, 1, 2, 2, 2], dtype=torch.float, device="cuda").requires_grad_() + ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda() + ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda() + ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda() + + kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + kept[1:] = ranks_bev[1:] != ranks_bev[:-1] + interval_starts = torch.where(kept)[0].int() + if len(interval_starts) == 0: + return None, None, None, None, None + interval_lengths = torch.zeros_like(interval_starts) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] + bev_feat = bev_pool_v2( + depth, feat, ranks_depth, ranks_feat, ranks_bev, (1, 1, 2, 2, 2), interval_starts, interval_lengths + ) + loss = torch.sum(bev_feat) + loss.backward() + assert loss == 4.4 + grad_depth = np.array([2.0, 2.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0]) + grad_depth = torch.from_numpy(grad_depth).float() + grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2) + assert depth.grad.allclose(grad_depth) + grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0.0, 0.0]) + grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2) + assert feat.grad.allclose(grad_feat) diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp new file mode 100644 index 000000000..c7c38f695 --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp @@ -0,0 +1,111 @@ +// Copyright (c) Phigent Robotics. All rights reserved. +// Reference https://arxiv.org/abs/2211.17111 +#include +#include + +// CUDA function declarations +void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, + const int* ranks_depth, const int* ranks_feat, const int* ranks_bev, + const int* interval_starts, const int* interval_lengths, float* out); + +void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad, + const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat, + const int* ranks_bev, const int* interval_starts, const int* interval_lengths, + float* depth_grad, float* feat_grad); + + +/* + Function: pillar pooling (forward, cuda) + Args: + depth : input depth, FloatTensor[n, d, h, w] + feat : input features, FloatTensor[n, h, w, c] + out : output features, FloatTensor[b, c, h_out, w_out] + ranks_depth : depth index of points, IntTensor[n_points] + ranks_feat : feat index of points, IntTensor[n_points] + ranks_bev : output index of points, IntTensor[n_points] + interval_lengths : starting position for pooled point, IntTensor[n_intervals] + interval_starts : how many points in each pooled point, IntTensor[n_intervals] + Return: +*/ +void bev_pool_v2_forward( + const at::Tensor _depth, + const at::Tensor _feat, + at::Tensor _out, + const at::Tensor _ranks_depth, + const at::Tensor _ranks_feat, + const at::Tensor _ranks_bev, + const at::Tensor _interval_lengths, + const at::Tensor _interval_starts +) { + int c = _feat.size(4); + int n_intervals = _interval_lengths.size(0); + const at::cuda::OptionalCUDAGuard device_guard(device_of(_depth)); + const float* depth = _depth.data_ptr(); + const float* feat = _feat.data_ptr(); + const int* ranks_depth = _ranks_depth.data_ptr(); + const int* ranks_feat = _ranks_feat.data_ptr(); + const int* ranks_bev = _ranks_bev.data_ptr(); + + const int* interval_lengths = _interval_lengths.data_ptr(); + const int* interval_starts = _interval_starts.data_ptr(); + + float* out = _out.data_ptr(); + bev_pool_v2( + c, n_intervals, depth, feat, ranks_depth, ranks_feat, + ranks_bev, interval_starts, interval_lengths, out + ); +} + + +/* + Function: pillar pooling (backward, cuda) + Args: + out_grad : grad of output bev feature, FloatTensor[b, c, h_out, w_out] + depth_grad : grad of input depth, FloatTensor[n, d, h, w] + feat_grad : grad of input feature, FloatTensor[n, h, w, c] + depth : input depth, FloatTensor[n, d, h, w] + feat : input features, FloatTensor[n, h, w, c] + ranks_depth : depth index of points, IntTensor[n_points] + ranks_feat : feat index of points, IntTensor[n_points] + ranks_bev : output index of points, IntTensor[n_points] + interval_lengths : starting position for pooled point, IntTensor[n_intervals] + interval_starts : how many points in each pooled point, IntTensor[n_intervals] +*/ +void bev_pool_v2_backward( + const at::Tensor _out_grad, + at::Tensor _depth_grad, + at::Tensor _feat_grad, + const at::Tensor _depth, + const at::Tensor _feat, + const at::Tensor _ranks_depth, + const at::Tensor _ranks_feat, + const at::Tensor _ranks_bev, + const at::Tensor _interval_lengths, + const at::Tensor _interval_starts +) { + int c = _out_grad.size(4); + int n_intervals = _interval_lengths.size(0); + const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad)); + const float* out_grad = _out_grad.data_ptr(); + float* depth_grad = _depth_grad.data_ptr(); + float* feat_grad = _feat_grad.data_ptr(); + const float* depth = _depth.data_ptr(); + const float* feat = _feat.data_ptr(); + const int* ranks_depth = _ranks_depth.data_ptr(); + const int* ranks_feat = _ranks_feat.data_ptr(); + const int* ranks_bev = _ranks_bev.data_ptr(); + const int* interval_lengths = _interval_lengths.data_ptr(); + const int* interval_starts = _interval_starts.data_ptr(); + + bev_pool_v2_grad( + c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat, + ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad + ); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("bev_pool_v2_forward", &bev_pool_v2_forward, + "bev_pool_v2_forward"); + m.def("bev_pool_v2_backward", &bev_pool_v2_backward, + "bev_pool_v2_backward"); +} diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu new file mode 100644 index 000000000..7fa3179b7 --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu @@ -0,0 +1,140 @@ +// Copyright (c) Phigent Robotics. All rights reserved. +// Reference https://arxiv.org/abs/2211.17111 + +#include +#include + +/* + Function: pillar pooling + Args: + c : number of channels + n_intervals : number of unique points + depth : input depth, FloatTensor[b,n,d,h,w] + feat : input feat, FloatTensor[b,n,h,w,c] + ranks_depth : input index of depth, IntTensor[n] + ranks_feat : input index of feat, IntTensor[n] + ranks_bev : output index, IntTensor[n] + interval_lengths : starting position for pooled point, IntTensor[n_intervals] + interval_starts : how many points in each pooled point, IntTensor[n_intervals] + out : output features, FloatTensor[b, d, h, w, c] +*/ +__global__ void bev_pool_v2_kernel(int c, int n_intervals, + const float *__restrict__ depth, + const float *__restrict__ feat, + const int *__restrict__ ranks_depth, + const int *__restrict__ ranks_feat, + const int *__restrict__ ranks_bev, + const int *__restrict__ interval_starts, + const int *__restrict__ interval_lengths, + float* __restrict__ out) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int index = idx / c; + int cur_c = idx % c; + if (index >= n_intervals) return; + int interval_start = interval_starts[index]; + int interval_length = interval_lengths[index]; + float psum = 0; + const float* cur_depth; + const float* cur_feat; + for(int i = 0; i < interval_length; i++){ + cur_depth = depth + ranks_depth[interval_start+i]; + cur_feat = feat + ranks_feat[interval_start+i] * c + cur_c; + psum += *cur_feat * *cur_depth; + } + + const int* cur_rank = ranks_bev + interval_start; + float* cur_out = out + *cur_rank * c + cur_c; + *cur_out = psum; +} + + +/* + Function: pillar pooling backward + Args: + c : number of channels + n_intervals : number of unique points + out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c] + depth : input depth, FloatTensor[b,n,d,h,w] + feat : input feat, FloatTensor[b,n,h,w,c] + ranks_depth : input index of depth, IntTensor[n] + ranks_feat : input index of feat, IntTensor[n] + ranks_bev : output index, IntTensor[n] + interval_lengths : starting position for pooled point, IntTensor[n_intervals] + interval_starts : how many points in each pooled point, IntTensor[n_intervals] + depth_grad : gradient of the depth fmap, FloatTensor + feat_grad : gradient of the feature fmap, FloatTensor +*/ +__global__ void bev_pool_grad_kernel(int c, int n_intervals, + const float *__restrict__ out_grad, + const float *__restrict__ depth, + const float *__restrict__ feat, + const int *__restrict__ ranks_depth, + const int *__restrict__ ranks_feat, + const int *__restrict__ ranks_bev, + const int *__restrict__ interval_starts, + const int *__restrict__ interval_lengths, + float* __restrict__ depth_grad, + float* __restrict__ feat_grad) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n_intervals) return; + int interval_start = interval_starts[idx]; + int interval_length = interval_lengths[idx]; + + const int* cur_rank; + const float* cur_out_grad; + const float* cur_out_grad_start; + + const float* cur_feat; + const float* cur_feat_start; + float* cur_depth_grad; + float grad_sum; + for(int i = 0; i < interval_length; i++){ + cur_rank = ranks_bev + interval_start + i; + cur_out_grad_start = out_grad + * cur_rank * c; + cur_feat_start = feat + ranks_feat[interval_start+i] * c; + + grad_sum = 0; + for(int cur_c = 0; cur_c < c; cur_c++){ + cur_out_grad = cur_out_grad_start + cur_c; + cur_feat = cur_feat_start + cur_c; + grad_sum += *cur_out_grad * *cur_feat; + } + + cur_depth_grad = depth_grad + ranks_depth[interval_start+i]; + *cur_depth_grad = grad_sum; + } + + float* cur_feat_grad; + const float* cur_depth; + for(int cur_c = 0; cur_c < c; cur_c++){ + grad_sum = 0; + for(int i = 0; i < interval_length; i++){ + cur_rank = ranks_bev + interval_start + i; + cur_out_grad = out_grad + *cur_rank * c + cur_c; + + cur_depth = depth + ranks_depth[interval_start+i]; + grad_sum += *cur_out_grad * *cur_depth; + } + cur_feat_grad = feat_grad + ranks_feat[interval_start] * c + cur_c ; + * cur_feat_grad = grad_sum; + } +} + + + +void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, const int* ranks_depth, + const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* out) { + bev_pool_v2_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>( + c, n_intervals, depth, feat, ranks_depth, ranks_feat, + ranks_bev, interval_starts, interval_lengths, out + ); +} + +void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad, + const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat, + const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* depth_grad, float* feat_grad) { + bev_pool_grad_kernel<<<(int)ceil(((double)n_intervals / 256)), 256>>>( + c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat, + ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad + ); +} diff --git a/projects/BEVFusion/bevfusion/ops/topk/__init__.py b/projects/BEVFusion/bevfusion/ops/topk/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/projects/BEVFusion/bevfusion/ops/topk/topk.py b/projects/BEVFusion/bevfusion/ops/topk/topk.py new file mode 100644 index 000000000..f0f9a8779 --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/topk/topk.py @@ -0,0 +1,46 @@ +""" +This file is used to write functions to deploy custom plugins to support Autoware, for example, TopK. +""" + +import torch +from torch.autograd import Function +from torch.onnx.symbolic_helper import _get_tensor_sizes + + +class TopK(Function): + + @staticmethod + def symbolic( + g, + x: torch.Tensor, + k: int, + dim: int, + sorted: bool = False, + ): + + output = g.op( + "autoware::Argsort", + x, + outputs=1, + ) + x_shape = _get_tensor_sizes(x) + if x_shape is not None and hasattr(output.type(), "with_sizes"): + output_type = x.type().with_sizes(x_shape) + output.setType(output_type) + # Argsort from Autoware is in ascending order, so we need to return the last k elements. + return output[-k:] + + @staticmethod + def forward( + ctx, + x: torch.Tensor, + k: int, + dim: int, + sorted: bool = False, + ): + _, indices = torch.topk(x, k=k, dim=dim, largest=True, sorted=sorted) + return indices + + +def topk(x: torch.Tensor, k: int, dim: int, sorted: bool = False): + return TopK.apply(x, k, dim, sorted) diff --git a/projects/BEVFusion/bevfusion/sparse_encoder.py b/projects/BEVFusion/bevfusion/sparse_encoder.py index 019cb630c..6bf0592b6 100644 --- a/projects/BEVFusion/bevfusion/sparse_encoder.py +++ b/projects/BEVFusion/bevfusion/sparse_encoder.py @@ -1,4 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. + +import os +from typing import Dict, Optional + +import numpy as np +import torch from mmdet3d.models.layers import make_sparse_convmodule from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE from mmdet3d.models.middle_encoders import SparseEncoder @@ -9,8 +15,7 @@ else: from mmcv.ops import SparseConvTensor -import numpy as np -import torch +from .custom_sparse_conv_tensor import sparse_to_dense @MODELS.register_module() @@ -24,6 +29,7 @@ class BEVFusionSparseEncoder(SparseEncoder): Args: in_channels (int): The number of input channels. sparse_shape (list[int]): The sparse shape of input tensor. + dense_output_shape (list[int]): The final shape of the dense output tensor. order (list[str], optional): Order of conv module. Defaults to ('conv', 'norm', 'act'). norm_cfg (dict, optional): Config of normalization layer. Defaults to @@ -47,10 +53,8 @@ class BEVFusionSparseEncoder(SparseEncoder): def __init__( self, in_channels, - aug_features_min_values, - aug_features_max_values, - num_aug_features, sparse_shape, + dense_output_shapes, order=("conv", "norm", "act"), norm_cfg=dict(type="BN1d", eps=1e-3, momentum=0.01), base_channels=16, @@ -63,10 +67,8 @@ def __init__( super(SparseEncoder, self).__init__() assert block_type in ["conv_module", "basicblock"] self.sparse_shape = sparse_shape + self.dense_output_shapes = dense_output_shapes self.in_channels = in_channels - self.register_buffer("aug_features_min_values", torch.tensor(aug_features_min_values)) - self.register_buffer("aug_features_max_values", torch.tensor(aug_features_max_values)) - self.num_aug_features = num_aug_features self.order = order self.base_channels = base_channels self.output_channels = output_channels @@ -77,10 +79,6 @@ def __init__( self.return_middle_feats = return_middle_feats # Spconv init all weight on its own - if num_aug_features: - self.in_channels = in_channels * num_aug_features * 2 - self.register_buffer("exponents", (2 ** torch.arange(0, num_aug_features).float())) - assert isinstance(order, tuple) and len(order) == 3 assert set(order) == {"conv", "norm", "act"} @@ -140,16 +138,6 @@ def forward(self, voxel_features, coors, batch_size): output features. When self.return_middle_feats is True, the module returns middle features. """ - - if self.num_aug_features: - num_points = voxel_features.shape[0] - x = (voxel_features - self.aug_features_min_values.view(1, -1)) / ( - self.aug_features_max_values - self.aug_features_min_values - ).view(1, -1) - y = x.reshape(-1, 1) * np.pi * self.exponents.reshape(1, -1) - y = y.reshape(num_points, -1) - voxel_features = torch.cat([torch.cos(y), torch.sin(y)], dim=1) - coors = coors.int() input_sp_tensor = SparseConvTensor(voxel_features, coors, self.sparse_shape, batch_size) x = self.conv_input(input_sp_tensor) @@ -162,11 +150,16 @@ def forward(self, voxel_features, coors, batch_size): # for detection head # [200, 176, 5] -> [200, 176, 2] out = self.conv_out(encode_features[-1]) - spatial_features = out.dense() - N, C, H, W, D = spatial_features.shape - spatial_features = spatial_features.permute(0, 1, 4, 2, 3).contiguous() - spatial_features = spatial_features.view(N, C * D, H, W) + spatial_features = sparse_to_dense(out, batch_size, self.dense_output_shapes, self.output_channels) + # spatial_features = out.dense(channels_first=False) + spatial_features = spatial_features.permute(0, 4, 3, 1, 2).contiguous() + spatial_features = spatial_features.view( + batch_size, + self.output_channels * self.dense_output_shapes[2], + self.dense_output_shapes[0], + self.dense_output_shapes[1], + ) if self.return_middle_feats: return spatial_features, encode_features diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index 7e9faca24..31d0cc417 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -188,6 +188,19 @@ def transform(self, input_dict: dict) -> dict: return input_dict +@TRANSFORMS.register_module() +class BEVFusionRemoveLiDARPoints(BaseTransform): + """Remove LiDAR points from the data.""" + + def __init__(self): + super().__init__() + + def transform(self, results: Dict[str, Any]) -> Dict[str, Any]: + if "points" in results: + results["points"] = None + return results + + @TRANSFORMS.register_module() class GridMask(BaseTransform): diff --git a/projects/BEVFusion/bevfusion/utils.py b/projects/BEVFusion/bevfusion/utils.py index c47604dbd..39c6a0ded 100644 --- a/projects/BEVFusion/bevfusion/utils.py +++ b/projects/BEVFusion/bevfusion/utils.py @@ -85,26 +85,31 @@ def decode(self, heatmap, rot, dim, center, height, vel, filter=False): final_box_preds = torch.cat([center, height, dim, rot, vel], dim=1).permute(0, 2, 1) predictions_dicts = [] - for i in range(heatmap.shape[0]): - boxes3d = final_box_preds[i] - scores = final_scores[i] - labels = final_preds[i] - predictions_dict = {"bboxes": boxes3d, "scores": scores, "labels": labels} - predictions_dicts.append(predictions_dict) - - if filter is False: + if not filter: + for i in range(heatmap.shape[0]): + boxes3d = final_box_preds[i] + scores = final_scores[i] + labels = final_preds[i] + predictions_dict = {"bboxes": boxes3d, "scores": scores, "labels": labels} + predictions_dicts.append(predictions_dict) return predictions_dicts # use score threshold if self.score_threshold is not None: - thresh_mask = final_scores > self.score_threshold + if isinstance(self.score_threshold, float): + thresh_mask = final_scores > self.score_threshold + elif isinstance(self.score_threshold, (list, tuple)): + score_threshold = final_scores.new_tensor(self.score_threshold) + thresh_mask = final_scores > score_threshold[final_preds] + else: + raise ValueError("score_threshold must be a float or list") + predictions_dicts = [] if self.post_center_range is not None: self.post_center_range = torch.tensor(self.post_center_range, device=heatmap.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(2) mask &= (final_box_preds[..., :3] <= self.post_center_range[3:]).all(2) - predictions_dicts = [] for i in range(heatmap.shape[0]): cmask = mask[i, :] if self.score_threshold: @@ -114,7 +119,6 @@ def decode(self, heatmap, rot, dim, center, height, vel, filter=False): scores = final_scores[i, cmask] labels = final_preds[i, cmask] predictions_dict = {"bboxes": boxes3d, "scores": scores, "labels": labels} - predictions_dicts.append(predictions_dict) else: raise NotImplementedError( diff --git a/projects/BEVFusion/configs/deploy/bevfusion_main_body_lidar_only_intensity_tensorrt_dynamic.py b/projects/BEVFusion/configs/deploy/bevfusion_main_body_lidar_only_intensity_tensorrt_dynamic.py index e22e0f41b..2652b3965 100644 --- a/projects/BEVFusion/configs/deploy/bevfusion_main_body_lidar_only_intensity_tensorrt_dynamic.py +++ b/projects/BEVFusion/configs/deploy/bevfusion_main_body_lidar_only_intensity_tensorrt_dynamic.py @@ -1,11 +1,7 @@ codebase_config = dict(type="mmdet3d", task="VoxelDetection", model_type="end2end") custom_imports = dict( - imports=[ - "projects.BEVFusion.deploy", - "projects.BEVFusion.bevfusion", - "projects.SparseConvolution", - ], + imports=["projects.BEVFusion.deploy", "projects.BEVFusion.bevfusion", "projects.SparseConvolution"], allow_failed_imports=False, ) @@ -29,7 +25,7 @@ type="onnx", export_params=True, keep_initializers_as_inputs=False, - opset_version=17, + opset_version=18, save_file="bevfusion_lidar_intensity.onnx", input_names=["voxels", "coors", "num_points_per_voxel"], output_names=["bbox_pred", "score", "label_pred"], @@ -45,5 +41,5 @@ }, }, input_shape=None, - verbose=True, + verbose=False, ) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py similarity index 75% rename from projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py rename to projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py index e65c52ece..987c13393 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py @@ -1,9 +1,8 @@ _base_ = [ "../../../../../autoware_ml/configs/detection3d/default_runtime.py", "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py", - "../default/pipelines/default_camera_lidar_intensity_120m.py", - "../default/models/default_camera_swin_fpn_120m.py", - "../default/schedulers/default_30e_8xb8_adamw_linear_cosine.py", + "../default/pipelines/cameras/default_camera_120m.py", + "../default/schedulers/default_30e_8xb16_adamw_linear_cosine.py", "../default/default_misc.py", ] @@ -13,35 +12,7 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" - -experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type -experiment_name = "bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m" -work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name - -# model parameter -model = dict( - type="BEVFusion", - view_transform=dict(image_size=_base_.image_size), - bbox_head=dict( - class_names=_base_.class_names, - in_channels=80, - train_cfg=dict( - point_cloud_range=_base_.point_cloud_range, - grid_size=_base_.grid_size, - voxel_size=_base_.voxel_size, - ), - test_cfg=dict( - grid_size=_base_.grid_size, - voxel_size=_base_.voxel_size[0:2], - pc_range=_base_.point_cloud_range[0:2], - ), - bbox_coder=dict( - pc_range=_base_.point_cloud_range[0:2], - voxel_size=_base_.voxel_size[0:2], - ), - ), -) +info_directory_path = "info/kokseang_2_8_0/" # Dataset parameters train_dataloader = dict( diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py new file mode 100644 index 000000000..ffe9f1363 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py @@ -0,0 +1,108 @@ +_base_ = [ + "../../../../../autoware_ml/configs/detection3d/default_runtime.py", + "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py", + "../default/pipelines/cameras/default_camera_50m.py", + "../default/schedulers/default_30e_8xb16_adamw_linear_cosine.py", + "../default/default_misc.py", +] + +custom_imports = dict(imports=["projects.BEVFusion.bevfusion"], allow_failed_imports=False) +custom_imports["imports"] += _base_.custom_imports["imports"] +custom_imports["imports"] += ["autoware_ml.detection3d.datasets.transforms"] + +# user setting +data_root = "data/t4dataset/" +info_directory_path = "info/kokseang_2_8/" + +# Dataset parameters +train_dataloader = dict( + batch_size=_base_.train_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + dataset=dict( + type=_base_.dataset_type, + pipeline=_base_.train_pipeline, + modality=_base_.input_modality, + backend_args=_base_.backend_args, + data_root=data_root, + ann_file=info_directory_path + _base_.info_train_file_name, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + test_mode=False, + data_prefix=_base_.data_prefix, + box_type_3d="LiDAR", + filter_cfg=_base_.filter_cfg, + ), +) + +val_dataloader = dict( + batch_size=_base_.test_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + ann_file=info_directory_path + _base_.info_val_file_name, + pipeline=_base_.test_pipeline, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + modality=_base_.input_modality, + data_prefix=_base_.data_prefix, + test_mode=True, + box_type_3d="LiDAR", + backend_args=_base_.backend_args, + ), +) + +test_dataloader = dict( + batch_size=_base_.test_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + ann_file=info_directory_path + _base_.info_test_file_name, + pipeline=_base_.test_pipeline, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + modality=_base_.input_modality, + data_prefix=_base_.data_prefix, + test_mode=True, + box_type_3d="LiDAR", + backend_args=_base_.backend_args, + ), +) + +val_evaluator = dict( + type="T4Metric", + data_root=data_root, + ann_file=data_root + info_directory_path + _base_.info_val_file_name, + metric="bbox", + backend_args=_base_.backend_args, + class_names=_base_.class_names, + name_mapping=_base_.name_mapping, + eval_class_range=_base_.eval_class_range, + filter_attributes=_base_.filter_attributes, +) + +test_evaluator = dict( + type="T4Metric", + data_root=data_root, + ann_file=data_root + info_directory_path + _base_.info_test_file_name, + metric="bbox", + backend_args=_base_.backend_args, + class_names=_base_.class_names, + name_mapping=_base_.name_mapping, + eval_class_range=_base_.eval_class_range, + filter_attributes=_base_.filter_attributes, + save_csv=True, +) + +default_hooks = dict( + logger=dict(type="LoggerHook", interval=50), + checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"), +) +log_processor = dict(window_size=50) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_120m.py new file mode 100644 index 000000000..7a81be126 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_120m.py @@ -0,0 +1,108 @@ +_base_ = [ + "../../../../../autoware_ml/configs/detection3d/default_runtime.py", + "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/base.py", + "../default/pipelines/cameras/default_camera_120m.py", + "../default/schedulers/default_50e_8xb16_adamw_linear_cosine.py", + "../default/default_misc.py", +] + +custom_imports = dict(imports=["projects.BEVFusion.bevfusion"], allow_failed_imports=False) +custom_imports["imports"] += _base_.custom_imports["imports"] +custom_imports["imports"] += ["autoware_ml.detection3d.datasets.transforms"] + +# user setting +data_root = "data/t4dataset/" +info_directory_path = "info/kokseang_2_8_0/" + +# Dataset parameters +train_dataloader = dict( + batch_size=_base_.train_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + dataset=dict( + type=_base_.dataset_type, + pipeline=_base_.train_pipeline, + modality=_base_.input_modality, + backend_args=_base_.backend_args, + data_root=data_root, + ann_file=info_directory_path + _base_.info_train_file_name, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + test_mode=False, + data_prefix=_base_.data_prefix, + box_type_3d="LiDAR", + filter_cfg=_base_.filter_cfg, + ), +) + +val_dataloader = dict( + batch_size=_base_.test_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + ann_file=info_directory_path + _base_.info_val_file_name, + pipeline=_base_.test_pipeline, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + modality=_base_.input_modality, + data_prefix=_base_.data_prefix, + test_mode=True, + box_type_3d="LiDAR", + backend_args=_base_.backend_args, + ), +) + +test_dataloader = dict( + batch_size=_base_.test_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + ann_file=info_directory_path + _base_.info_test_file_name, + pipeline=_base_.test_pipeline, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + modality=_base_.input_modality, + data_prefix=_base_.data_prefix, + test_mode=True, + box_type_3d="LiDAR", + backend_args=_base_.backend_args, + ), +) + +val_evaluator = dict( + type="T4Metric", + data_root=data_root, + ann_file=data_root + info_directory_path + _base_.info_val_file_name, + metric="bbox", + backend_args=_base_.backend_args, + class_names=_base_.class_names, + name_mapping=_base_.name_mapping, + eval_class_range=_base_.eval_class_range, + filter_attributes=_base_.filter_attributes, +) + +test_evaluator = dict( + type="T4Metric", + data_root=data_root, + ann_file=data_root + info_directory_path + _base_.info_test_file_name, + metric="bbox", + backend_args=_base_.backend_args, + class_names=_base_.class_names, + name_mapping=_base_.name_mapping, + eval_class_range=_base_.eval_class_range, + filter_attributes=_base_.filter_attributes, + save_csv=True, +) + +default_hooks = dict( + logger=dict(type="LoggerHook", interval=50), + checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"), +) +log_processor = dict(window_size=50) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_50m.py new file mode 100644 index 000000000..4b79e2102 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb16_base_50m.py @@ -0,0 +1,108 @@ +_base_ = [ + "../../../../../autoware_ml/configs/detection3d/default_runtime.py", + "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/base.py", + "../default/pipelines/cameras/default_camera_50m.py", + "../default/schedulers/default_50e_8xb16_adamw_linear_cosine.py", + "../default/default_misc.py", +] + +custom_imports = dict(imports=["projects.BEVFusion.bevfusion"], allow_failed_imports=False) +custom_imports["imports"] += _base_.custom_imports["imports"] +custom_imports["imports"] += ["autoware_ml.detection3d.datasets.transforms"] + +# user setting +data_root = "data/t4dataset/" +info_directory_path = "info/kokseang_2_8_0/" + +# Dataset parameters +train_dataloader = dict( + batch_size=_base_.train_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + dataset=dict( + type=_base_.dataset_type, + pipeline=_base_.train_pipeline, + modality=_base_.input_modality, + backend_args=_base_.backend_args, + data_root=data_root, + ann_file=info_directory_path + _base_.info_train_file_name, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + test_mode=False, + data_prefix=_base_.data_prefix, + box_type_3d="LiDAR", + filter_cfg=_base_.filter_cfg, + ), +) + +val_dataloader = dict( + batch_size=_base_.test_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + ann_file=info_directory_path + _base_.info_val_file_name, + pipeline=_base_.test_pipeline, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + modality=_base_.input_modality, + data_prefix=_base_.data_prefix, + test_mode=True, + box_type_3d="LiDAR", + backend_args=_base_.backend_args, + ), +) + +test_dataloader = dict( + batch_size=_base_.test_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + ann_file=info_directory_path + _base_.info_test_file_name, + pipeline=_base_.test_pipeline, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + modality=_base_.input_modality, + data_prefix=_base_.data_prefix, + test_mode=True, + box_type_3d="LiDAR", + backend_args=_base_.backend_args, + ), +) + +val_evaluator = dict( + type="T4Metric", + data_root=data_root, + ann_file=data_root + info_directory_path + _base_.info_val_file_name, + metric="bbox", + backend_args=_base_.backend_args, + class_names=_base_.class_names, + name_mapping=_base_.name_mapping, + eval_class_range=_base_.eval_class_range, + filter_attributes=_base_.filter_attributes, +) + +test_evaluator = dict( + type="T4Metric", + data_root=data_root, + ann_file=data_root + info_directory_path + _base_.info_test_file_name, + metric="bbox", + backend_args=_base_.backend_args, + class_names=_base_.class_names, + name_mapping=_base_.name_mapping, + eval_class_range=_base_.eval_class_range, + filter_attributes=_base_.filter_attributes, + save_csv=True, +) + +default_hooks = dict( + logger=dict(type="LoggerHook", interval=50), + checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"), +) +log_processor = dict(window_size=50) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_30e_8xb16_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_30e_8xb16_j6gen2_base_120m.py new file mode 100644 index 000000000..92501d169 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_30e_8xb16_j6gen2_base_120m.py @@ -0,0 +1,31 @@ +_base_ = [ + "../default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py", + "../../default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py", +] + +experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_resnet50_fpn_depthlss_30e_8xb16_j6gen2_base_120m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_50e_8xb16_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_50e_8xb16_base_120m.py new file mode 100644 index 000000000..47c91cfb3 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_depthlss_50e_8xb16_base_120m.py @@ -0,0 +1,31 @@ +_base_ = [ + "../default_bevfusion_camera_50e_8xb16_base_120m.py", + "../../default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py", +] + +experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_resnet50_fpn_depthlss_50e_8xb16_base_120m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_30e_2xb16_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_30e_2xb16_j6gen2_base_50m.py new file mode 100644 index 000000000..9074f14d2 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_30e_2xb16_j6gen2_base_50m.py @@ -0,0 +1,31 @@ +_base_ = [ + "../default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py", + "../../default/models/resnet50/camera_resnet50_fpn_lss_depth_50m.py", +] + +experiment_group_name = "bevfusion_camera/j6gen2_base_depth_adjust_v2/" + _base_.dataset_type +experiment_name = "bevfusion_camera_resnet50_fpn_lss_depth_30e_8xb16_j6gen2_base_50m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_50e_8xb16_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_50e_8xb16_base_50m.py new file mode 100644 index 000000000..40f008b34 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_50e_8xb16_base_50m.py @@ -0,0 +1,31 @@ +_base_ = [ + "../default_bevfusion_camera_50e_8xb16_base_50m.py", + "../../default/models/resnet50/camera_resnet50_fpn_camera_aware_lssv2_50m.py", +] + +experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_resnet50_fpn_camera_depth_aware_lssv2_50e_8xb16_base_50m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_depthlss_50e_8xb16_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_depthlss_50e_8xb16_base_120m.py new file mode 100644 index 000000000..56c2930bb --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_depthlss_50e_8xb16_base_120m.py @@ -0,0 +1,32 @@ +_base_ = [ + "../default_bevfusion_camera_50e_8xb16_base_50m.py", + "../../default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py", +] + +experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_swin_fpn_depthlss_50e_8xb16_base_120m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + in_channels=80, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_base_50m.py new file mode 100644 index 000000000..8d1ff7681 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_base_50m.py @@ -0,0 +1,32 @@ +_base_ = [ + "../default_bevfusion_camera_50e_8xb16_base_50m.py", + "../../default/models/swin_transformer/camera_swin_fpn_lss_50m.py", +] + +experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_swin_fpn_lss_50e_8xb16_base_50m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + in_channels=80, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_120m.py new file mode 100644 index 000000000..401ac7861 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_120m.py @@ -0,0 +1,32 @@ +_base_ = [ + "../default_bevfusion_camera_30e_8xb16_j6gen2_base_120m.py", + "../../default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py", +] + +experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_swin_fpn_depthlss_50e_8xb16_j6gen2_base_120m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + in_channels=80, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_50m.py new file mode 100644 index 000000000..80e81be39 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_50m.py @@ -0,0 +1,32 @@ +_base_ = [ + "../default_bevfusion_camera_30e_8xb16_j6gen2_base_50m.py", + "../../default/models/swin_transformer/camera_swin_fpn_lss_50m.py", +] + +experiment_group_name = "bevfusion_camera/base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_swin_fpn_lss_50e_8xb16_j6gen2_base_50m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + in_channels=80, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py index 4f81af760..a93b1d435 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py @@ -2,8 +2,8 @@ "../../../../../autoware_ml/configs/detection3d/default_runtime.py", "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py", "../default/pipelines/default_camera_lidar_intensity_120m.py", - "../default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py", - "../default/schedulers/default_20e_8xb8_adamw_linear_cosine.py", + "../default/models/default_camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py", + "../default/schedulers/default_20e_8xb16_adamw_linear_cosine.py", "../default/default_misc.py", ] diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py index 20c85b1d8..b8408956b 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py @@ -2,8 +2,8 @@ "../../../../../autoware_ml/configs/detection3d/default_runtime.py", "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py", "../default/pipelines/default_camera_lidar_intensity_120m.py", - "../default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py", - "../default/schedulers/default_20e_8xb8_adamw_linear_cosine.py", + "../default/models/default_camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py", + "../default/schedulers/default_20e_8xb16_adamw_linear_cosine.py", "../default/default_misc.py", ] diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py similarity index 88% rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m.py rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py index d32dc9c70..bcf9870c6 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py @@ -3,7 +3,7 @@ "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py", "../default/pipelines/default_lidar_intensity_120m.py", "../default/models/default_lidar_second_secfpn_120m.py", - "../default/schedulers/default_30e_8xb8_adamw_cosine.py", + "../default/schedulers/default_30e_8xb16_adamw_cosine.py", "../default/default_misc.py", ] @@ -13,10 +13,10 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" +info_directory_path = "info/kokseang_2_8_1/" -experiment_group_name = "bevfusion_lidar_intensity/j6gen2_base/" + _base_.dataset_type -experiment_name = "lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m" +experiment_group_name = "bevfusion_lidar_intensity_2_8_3/j6gen2_base_normal_dense/" + _base_.dataset_type +experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m" work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name # model parameter @@ -25,22 +25,18 @@ voxelize_cfg=dict( point_cloud_range=_base_.point_cloud_range, voxel_size=_base_.voxel_size, - voxelize_reduce=True, ), - pts_voxel_encoder=dict(num_features=_base_.point_use_dim), - pts_middle_encoder=dict( - in_channels=_base_.point_use_dim, - sparse_shape=_base_.grid_size, - num_aug_features=5, + pts_voxel_encoder=dict( + in_channels=len(_base_.lidar_sweep_dims), # min-max normalization for x, y, z, intensity, time_lag, where the max of time lag technically is two seeps (200 ms) here - aug_features_min_values=[ + min_norm_values=[ _base_.point_cloud_range[0], _base_.point_cloud_range[1], _base_.point_cloud_range[2], 0.0, 0.0, ], - aug_features_max_values=[ + max_norm_values=[ _base_.point_cloud_range[3], _base_.point_cloud_range[4], _base_.point_cloud_range[5], @@ -48,6 +44,11 @@ 0.2, ], ), + pts_middle_encoder=dict( + in_channels=50, + sparse_shape=_base_.grid_size, + dense_output_shapes=_base_.sparse_dense_output_shapes, + ), bbox_head=dict( class_names=_base_.class_names, # Use class names to identify the correct class indices train_cfg=dict( @@ -164,4 +165,4 @@ ) log_processor = dict(window_size=50) -load_from = None +load_from = "work_dirs/bevfusion_lidar_2_8_2/base/T4Dataset/lidar_voxel_second_secfpn_50e_8xb16_base_120m/epoch_50.pth" diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m_t4metric_v2.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m_t4metric_v2.py similarity index 91% rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m_t4metric_v2.py rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m_t4metric_v2.py index 260080bc6..3bdda213e 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m_t4metric_v2.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m_t4metric_v2.py @@ -1,10 +1,10 @@ _base_ = [ - "./bevfusion_lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m.py", + "./bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py", ] # user setting -experiment_group_name = "bevfusion_lidar_intensity/j6gen2_base/" + _base_.dataset_type -experiment_name = "lidar_voxel_second_secfpn_30e_8xb8_j6gen2_base_120m_t4metric_v2" +experiment_group_name = "bevfusion_lidar_intensity_2_8_1/j6gen2_base/" + _base_.dataset_type +experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m_t4metric_v2" work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name # Add evaluator configs diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py similarity index 88% rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m.py rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py index 8fa1509b9..1f2acd6ab 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py @@ -3,7 +3,7 @@ "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py", "../default/pipelines/default_lidar_intensity_120m.py", "../default/models/default_lidar_second_secfpn_120m.py", - "../default/schedulers/default_30e_8xb8_adamw_cosine.py", + "../default/schedulers/default_30e_8xb16_adamw_cosine.py", "../default/default_misc.py", ] @@ -13,10 +13,10 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" +info_directory_path = "info/kokseang_2_8_1/" -experiment_group_name = "bevfusion_lidar_intensity/jpntaxi_base/" + _base_.dataset_type -experiment_name = "lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m" +experiment_group_name = "bevfusion_lidar_intensity_2_8_2/jpntaxi_base/" + _base_.dataset_type +experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m" work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name # model parameter @@ -25,22 +25,18 @@ voxelize_cfg=dict( point_cloud_range=_base_.point_cloud_range, voxel_size=_base_.voxel_size, - voxelize_reduce=True, ), - pts_voxel_encoder=dict(num_features=_base_.point_use_dim), - pts_middle_encoder=dict( - in_channels=_base_.point_use_dim, - sparse_shape=_base_.grid_size, - num_aug_features=5, + pts_voxel_encoder=dict( + in_channels=len(_base_.lidar_sweep_dims), # min-max normalization for x, y, z, intensity, time_lag, where the max of time lag technically is two seeps (200 ms) here - aug_features_min_values=[ + min_norm_values=[ _base_.point_cloud_range[0], _base_.point_cloud_range[1], _base_.point_cloud_range[2], 0.0, 0.0, ], - aug_features_max_values=[ + max_norm_values=[ _base_.point_cloud_range[3], _base_.point_cloud_range[4], _base_.point_cloud_range[5], @@ -48,6 +44,11 @@ 0.2, ], ), + pts_middle_encoder=dict( + in_channels=50, + sparse_shape=_base_.grid_size, + dense_output_shapes=_base_.sparse_dense_output_shapes, + ), bbox_head=dict( class_names=_base_.class_names, # Use class names to identify the correct class indices train_cfg=dict( @@ -164,4 +165,6 @@ ) log_processor = dict(window_size=50) -load_from = None +load_from = ( + "work_dirs/bevfusion_lidar_2_8_0/base/T4Dataset/lidar_voxel_second_secfpn_50e_8xb16_base_120m/best_epoch_47.pth" +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m_t4metric_v2.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m_t4metric_v2.py similarity index 91% rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m_t4metric_v2.py rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m_t4metric_v2.py index 3d543e709..64d494655 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m_t4metric_v2.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m_t4metric_v2.py @@ -1,10 +1,10 @@ _base_ = [ - "./bevfusion_lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m.py", + "./bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py", ] # user setting -experiment_group_name = "bevfusion_lidar_intensity/jpntaxi_base/" + _base_.dataset_type -experiment_name = "lidar_voxel_second_secfpn_30e_8xb8_jpntaxi_base_120m_t4metric_v2" +experiment_group_name = "bevfusion_lidar_intensity_2_8_1/jpntaxi_base/" + _base_.dataset_type +experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m_t4metric_v2" work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name # Add evaluator configs diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py similarity index 85% rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m.py rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py index e8068332a..446649985 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py @@ -3,7 +3,7 @@ "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/base.py", "../default/pipelines/default_lidar_120m.py", "../default/models/default_lidar_second_secfpn_120m.py", - "../default/schedulers/default_50e_8xb8_adamw_cosine.py", + "../default/schedulers/default_50e_8xb16_adamw_cosine.py", "../default/default_misc.py", ] @@ -13,10 +13,10 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" +info_directory_path = "info/kokseang_2_8_1/" -experiment_group_name = "bevfusion_lidar/base/" + _base_.dataset_type -experiment_name = "lidar_voxel_second_secfpn_50e_8xb8_base_120m" +experiment_group_name = "bevfusion_lidar_2_8_0/base_more_filters/" + _base_.dataset_type +experiment_name = "lidar_voxel_second_secfpn_50e_8xb16_base_120m" work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name # model parameter @@ -25,26 +25,17 @@ voxelize_cfg=dict( point_cloud_range=_base_.point_cloud_range, voxel_size=_base_.voxel_size, - voxelize_reduce=True, ), - pts_voxel_encoder=dict(num_features=_base_.point_use_dim), + pts_voxel_encoder=dict( + in_channels=len(_base_.lidar_sweep_dims), + # min-max normalization for x, y, z, time_lag, where the max of time lag technically is two seeps (200 ms) here + min_norm_values=[_base_.point_cloud_range[0], _base_.point_cloud_range[1], _base_.point_cloud_range[2], 0.0], + max_norm_values=[_base_.point_cloud_range[3], _base_.point_cloud_range[4], _base_.point_cloud_range[5], 0.2], + ), pts_middle_encoder=dict( - in_channels=_base_.point_use_dim, + in_channels=32, sparse_shape=_base_.grid_size, - num_aug_features=4, - # min-max normalization for x, y, z, time_lag, where the max of time lag technically is two seeps (200 ms) here - aug_features_min_values=[ - _base_.point_cloud_range[0], - _base_.point_cloud_range[1], - _base_.point_cloud_range[2], - 0.0, - ], - aug_features_max_values=[ - _base_.point_cloud_range[3], - _base_.point_cloud_range[4], - _base_.point_cloud_range[5], - 0.2, - ], + dense_output_shapes=_base_.sparse_dense_output_shapes, ), bbox_head=dict( class_names=_base_.class_names, # Use class names to identify the correct class indices @@ -161,3 +152,5 @@ checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"), ) log_processor = dict(window_size=50) + +resume = True diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m_t4metric_v2.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m_t4metric_v2.py similarity index 97% rename from projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m_t4metric_v2.py rename to projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m_t4metric_v2.py index efcd091f5..98a65a3f9 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb8_base_120m_t4metric_v2.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m_t4metric_v2.py @@ -18,7 +18,7 @@ frame_pass_fail_config = dict( target_labels=_base_.class_names, # Matching thresholds per class (must align with `plane_distance_thresholds` used in evaluation) - matching_threshold_list=[2.0, 2.0, 2.0, 2.0, 2.0], + matching_threshold_list=[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0], confidence_threshold_list=None, ) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_resnet50_fpn_depthlss_lidar_second_secfpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/default_camera_resnet50_fpn_depthlss_lidar_second_secfpn_120m.py new file mode 100644 index 000000000..339f3e97e --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/default_camera_resnet50_fpn_depthlss_lidar_second_secfpn_120m.py @@ -0,0 +1,53 @@ +_base_ = [ + "./default_lidar_second_secfpn_120m.py", +] + +# Image network +model = dict( + data_preprocessor=dict( + type="Det3DDataPreprocessor", + pad_size_divisor=32, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=False, + rgb_to_bgr=False, + ), + img_backbone=dict( + pretrained="torchvision://resnet50", + type="ResNet", + depth=50, + num_stages=4, + out_indices=(2, 3), + frozen_stages=-1, + norm_cfg=dict(type="BN2d", requires_grad=True), + norm_eval=False, + with_cp=False, + style="pytorch", + init_cfg=dict( + type="Pretrained", + checkpoint="work_dirs/resnet50/mmdet_resnet50-19c8e357.pth", # noqa: E251 + ), + ), + img_neck=dict( + type="GeneralizedLSSFPN", + in_channels=[1024, 2048], + out_channels=256, + start_level=0, + num_outs=2, + norm_cfg=dict(type="BN2d", requires_grad=True), + act_cfg=dict(type="ReLU", inplace=True), + upsample_cfg=dict(mode="bilinear", align_corners=False), + ), + view_transform=dict( + type="DepthLSSTransform", + in_channels=256, + out_channels=80, + feature_size=[48, 96], + xbound=[-122.40, 122.40, 0.68], + ybound=[-122.40, 122.40, 0.68], + zbound=[-10.0, 10.0, 20.0], + dbound=[1.0, 130, 1.0], + downsample=2, + ), + fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256, kernel_size=5, stride=2, padding=2), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m.py index 809179b20..f11431814 100644 --- a/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m.py @@ -1,25 +1,25 @@ num_proposals = 500 -max_num_points = 10 +max_num_points = 32 max_voxels = [120000, 160000] +out_size_factor = 8 model = dict( type="BEVFusion", voxelize_cfg=dict( max_num_points=max_num_points, max_voxels=max_voxels, - voxelize_reduce=True, ), data_preprocessor=dict( type="Det3DDataPreprocessor", pad_size_divisor=32, ), - pts_voxel_encoder=dict(type="HardSimpleVFE"), + pts_voxel_encoder=dict( + type="HardSimpleVoxelSinCosEncoder", + in_channels=4, + ), pts_middle_encoder=dict( type="BEVFusionSparseEncoder", in_channels=5, - aug_features_min_values=[], - aug_features_max_values=[], - num_aug_features=0, order=("conv", "norm", "act"), norm_cfg=dict(type="BN1d", eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), @@ -69,7 +69,7 @@ ), train_cfg=dict( dataset="t4datasets", - out_size_factor=8, + out_size_factor=out_size_factor, gaussian_overlap=0.1, min_radius=2, pos_weight=-1, @@ -84,15 +84,18 @@ ), test_cfg=dict( dataset="t4datasets", - out_size_factor=8, - nms_type=None, # Set to "circle" for circle_nms + out_size_factor=out_size_factor, + nms_type="circle", # Set to "circle" for circle_nms # Set NMS for different clusters nms_clusters=[ - dict(class_names=["car", "truck", "bus"], nms_threshold=0.5), # It's radius if using circle_nms - dict(class_names=["bicycle"], nms_threshold=0.5), - dict(class_names=["pedestrian"], nms_threshold=0.175), - dict(class_names=["barrier"], nms_threshold=0.5), - dict(class_names=["traffic_cone"], nms_threshold=0.175), + # Sqrt(0.25) = 0.50 + dict( + class_names=["car", "truck", "bus"], class_indices=[0, 1, 2], nms_threshold=0.25, post_max_size=300 + ), # It's radius if using circle_nms + dict(class_names=["bicycle"], class_indices=[3], nms_threshold=0.0, post_max_size=50), + dict(class_names=["pedestrian"], class_indices=[4], nms_threshold=0.0, post_max_size=100), + dict(class_names=["traffic_cone"], class_indices=[5], nms_threshold=0.0, post_max_size=100), + dict(class_names=["barrier"], class_indices=[6], nms_threshold=0.0, post_max_size=50), ], ), dense_heatmap_pooling_classes=["car", "truck", "bus", "bicycle", "barrier"], # Use class indices for pooling @@ -100,7 +103,9 @@ bbox_coder=dict( type="TransFusionBBoxCoder", post_center_range=[-200.0, -200.0, -10.0, 200.0, 200.0, 10.0], - score_threshold=0.0, + # score_threshold=0.03, + # CAR, TRUCK, BUS, BICYCLE, PEDESTRIAN, TRAFFIC_CONE, BARRIER + score_threshold=[0.015, 0.010, 0.010, 0.020, 0.030, 0.040, 0.020], out_size_factor=8, code_size=10, ), @@ -112,8 +117,10 @@ reduction="mean", loss_weight=1.0, ), - loss_heatmap=dict(type="mmdet.GaussianFocalLoss", reduction="mean", loss_weight=1.0), + loss_iou=None, + loss_heatmap=dict(type="mmdet.GaussianFocalLoss", reduction="none", loss_weight=1.0), loss_bbox=dict(type="mmdet.L1Loss", reduction="mean", loss_weight=0.25), - partial_ignore_labels=None, + # partial_ + partial_ignore_labels=["traffic_cone", "barrier"], ), ) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py new file mode 100644 index 000000000..e90687fe3 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py @@ -0,0 +1,10 @@ +_base_ = [ + "./default_lidar_second_secfpn_120m.py", +] + +model = dict( + bbox_head=dict( + common_heads=dict(center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2], iou=[1, 2]), + loss_iou=dict(type="mmdet.L1Loss", reduction="mean", loss_weight=1.0), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py new file mode 100644 index 000000000..c807668a3 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py @@ -0,0 +1,60 @@ +_base_ = [ + "../default_lidar_second_secfpn_120m.py", +] + +# Image network +model = dict( + # Remove all lidar related configs + voxelize_cfg=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_neck=None, + pts_backbone=None, + data_preprocessor=dict( + type="Det3DDataPreprocessor", + pad_size_divisor=32, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=False, + rgb_to_bgr=False, + ), + img_backbone=dict( + type="mmdet.ResNet", + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type="BN2d", requires_grad=True), + norm_eval=False, + with_cp=True, + style="pytorch", + init_cfg=dict( + type="Pretrained", + checkpoint="work_dirs/resnet50/mmdet_resnet50-19c8e357.pth", # noqa: E251 + ), + ), + img_neck=dict( + type="GeneralizedLSSFPN", + in_channels=[512, 1024, 2048], + out_channels=256, + start_level=0, + num_outs=2, + norm_cfg=dict(type="BN2d", requires_grad=True), + act_cfg=dict(type="ReLU", inplace=True), + upsample_cfg=dict(mode="bilinear", align_corners=False), + ), + view_transform=dict( + type="DepthLSSTransform", + in_channels=256, + out_channels=80, + feature_size=[48, 96], + xbound=[-122.40, 122.40, 0.68], + ybound=[-122.40, 122.40, 0.68], + zbound=[-10.0, 10.0, 20.0], + dbound=[1.0, 130, 1.0], + downsample=2, + ), + bbox_head=dict( + in_channels=80, + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_50m.py new file mode 100644 index 000000000..1457207c8 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_50m.py @@ -0,0 +1,24 @@ +_base_ = [ + "./camera_resnet50_fpn_depthlss_120m.py", +] +num_proposals = 200 + +# Image network +model = dict( + depth_gt_downsample=8, + loss_depth_weight=2.0, + view_transform=dict( + type="LSSTransformV2", + xbound=[-54.0, 54.0, 0.3], + ybound=[-54.0, 54.0, 0.3], + zbound=[-10.0, 10.0, 20.0], + dbound=[1.0, 60, 0.5], + downsample=2, + ), + bbox_head=dict( + num_proposals=num_proposals, + bbox_coder=dict( + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depth_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depth_50m.py new file mode 100644 index 000000000..dd90ccb6e --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depth_50m.py @@ -0,0 +1,26 @@ +_base_ = [ + "./camera_resnet50_fpn_depthlss_120m.py", +] +num_proposals = 200 + +# Image network +model = dict( + depth_gt_downsample=8, + loss_depth_weight=1.0, + view_transform=dict( + # type="LSSTransformV2", + type="LSSTransformV2DepthAware", + xbound=[-54.0, 54.0, 0.3], + ybound=[-54.0, 54.0, 0.3], + zbound=[-10.0, 10.0, 20.0], + dbound=[1.0, 60, 0.5], + downsample=2, + camera_depth_aware_configs=dict(mlp_drop_out=0.0, downsample=8, num_camera_depth_parameters=27), + ), + bbox_head=dict( + num_proposals=num_proposals, + bbox_coder=dict( + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py similarity index 97% rename from projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py rename to projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py index c4b0cd9ab..88e74efc7 100644 --- a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py @@ -1,5 +1,5 @@ _base_ = [ - "./default_lidar_second_secfpn_120m.py", + "../default_lidar_second_secfpn_120m.py", ] # Image network diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py similarity index 94% rename from projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py rename to projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py index c4097de3d..2ac22b1b6 100644 --- a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py @@ -1,5 +1,5 @@ _base_ = [ - "./default_lidar_second_secfpn_120m.py", + "../default_lidar_second_secfpn_120m.py", ] # Image network @@ -56,5 +56,5 @@ dbound=[1.0, 130, 1.0], downsample=2, ), - fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256, kernel_size=5, padding=2), + fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256, kernel_size=5, stride=2, padding=2), ) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_50m.py new file mode 100644 index 000000000..1294416ad --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_50m.py @@ -0,0 +1,23 @@ +_base_ = [ + "./default_camera_swin_fpn_depthlss_120m.py", +] + +# Image network +model = dict( + view_transform=dict( + type="LSSTransform", + in_channels=256, + out_channels=80, + feature_size=[48, 96], + xbound=[-54.0, 54.0, 0.3], + ybound=[-54.0, 54.0, 0.3], + zbound=[-10.0, 10.0, 20.0], + dbound=[1.0, 60, 0.5], + downsample=2, + ), + bbox_head=dict( + bbox_coder=dict( + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py new file mode 100644 index 000000000..3a7a428b7 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py @@ -0,0 +1,151 @@ +## This config is for the camera_base only model, without lidar points + +_base_ = [ + "../default_lidar_120m.py", +] +input_modality = dict(use_lidar=True, use_camera=True) + +# Image parameters +image_size = [384, 768] # Height, Width +camera_order = ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"] + +train_pipeline = [ + dict( + type="BEVLoadMultiViewImageFromFiles", + to_float32=True, + color_type="color", + backend_args=_base_.backend_args, + camera_order=camera_order, + ), + # We keep loading LiDAR points to make downstream BEV augmentation easier + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=_base_.point_load_dim, + use_dim=_base_.point_load_dim, + backend_args=_base_.backend_args, + ), + dict( + type="LoadPointsFromMultiSweeps", + sweeps_num=_base_.sweeps_num, + load_dim=_base_.point_load_dim, + use_dim=_base_.lidar_sweep_dims, + pad_empty_sweeps=True, + remove_close=True, + backend_args=_base_.backend_args, + test_mode=False, + ), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict( + type="ImageAug3D", + final_dim=image_size, + resize_lim=[0.28, 0.40], + bot_pct_lim=[0.0, 0.0], + rot_lim=[0.0, 0.0], + rand_flip=True, + is_train=True, + ), + dict( + type="BEVFusionGlobalRotScaleTrans", + scale_ratio_range=[0.95, 1.05], + rot_range=[-0.78539816, 0.78539816], + translation_std=[0.5, 0.5, 0.2], + ), + dict(type="BEVFusionRandomFlip3D"), + dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=5), + dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=3), + dict( + type="ObjectNameFilter", + classes=[ + "car", + "truck", + "bus", + "bicycle", + "pedestrian", + "traffic_cone", + "barrier", + ], + ), + dict( + type="Pack3DDetInputs", + keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "transformation_3d_flow", + "pcd_rotation", + "pcd_scale_factor", + "pcd_trans", + "img_aug_matrix", + "lidar_aug_matrix", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + "gt_depths", + ], + ), +] + +test_pipeline = [ + dict( + type="BEVLoadMultiViewImageFromFiles", + to_float32=True, + color_type="color", + backend_args=_base_.backend_args, + camera_order=camera_order, + ), + dict( + type="LoadPointsFromMultiSweeps", + sweeps_num=_base_.sweeps_num, + load_dim=_base_.point_load_dim, + use_dim=_base_.lidar_sweep_dims, + pad_empty_sweeps=True, + remove_close=True, + backend_args=_base_.backend_args, + test_mode=True, + ), + dict( + type="ImageAug3D", + final_dim=image_size, + resize_lim=[0.34, 0.34], + bot_pct_lim=[0.0, 0.0], + rot_lim=[0.0, 0.0], + rand_flip=False, + is_train=False, + ), + dict( + type="Pack3DDetInputs", + keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "num_pts_feats", + "num_views", + "timestamp", + "vehicle_type", + "city", + ], + ), +] + +filter_cfg = dict(filter_frames_with_camera_order=camera_order) diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py new file mode 100644 index 000000000..011e460f0 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py @@ -0,0 +1,144 @@ +## This config is for the camera_base only model, without lidar points + +_base_ = [ + "../default_lidar_50m.py", +] +input_modality = dict(use_lidar=True, use_camera=True) + +# Image parameters +image_size = [384, 768] # Height, Width +camera_order = ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"] + +train_pipeline = [ + dict( + type="BEVLoadMultiViewImageFromFiles", + to_float32=True, + color_type="color", + backend_args=_base_.backend_args, + camera_order=camera_order, + ), + # We keep loading LiDAR points to make downstream BEV augmentation easier + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=_base_.point_load_dim, + use_dim=_base_.point_load_dim, + backend_args=_base_.backend_args, + ), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict( + type="ImageAug3D", + final_dim=image_size, + resize_lim=[0.29, 0.35], + bot_pct_lim=[0.0, 0.0], + rot_lim=[0.0, 0.0], + rand_flip=True, + is_train=True, + ), + dict(type="PointsRangeFilter", point_cloud_range=[-80.0, -80.0, -10.0, 80.0, 80.0, 10.0]), + dict( + type="PointsToMultiViewImageDepths", + img_shape=image_size, + num_cameras=len(camera_order), + depth_bounds=[1.0, 60.0], + # visualize_dir="work_dirs/visualize_depths_6", + ), + dict( + type="BEVFusionGlobalRotScaleTrans", + scale_ratio_range=[0.95, 1.05], + rot_range=[-0.78539816, 0.78539816], + translation_std=[0.5, 0.5, 0.2], + ), + dict(type="BEVFusionRandomFlip3D"), + dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=5), + # Remove LiDAR points from the data + dict(type="BEVFusionRemoveLiDARPoints"), + dict( + type="ObjectNameFilter", + classes=[ + "car", + "truck", + "construction_vehicle", + "bus", + "trailer", + "barrier", + "motorcycle", + "bicycle", + "pedestrian", + "traffic_cone", + ], + ), + dict( + type="Pack3DDetInputs", + keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "transformation_3d_flow", + "pcd_rotation", + "pcd_scale_factor", + "pcd_trans", + "img_aug_matrix", + "lidar_aug_matrix", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + "gt_depths", + ], + ), +] + +test_pipeline = [ + dict( + type="BEVLoadMultiViewImageFromFiles", + to_float32=True, + color_type="color", + backend_args=_base_.backend_args, + camera_order=camera_order, + ), + dict( + type="ImageAug3D", + final_dim=image_size, + resize_lim=[0.32, 0.32], + bot_pct_lim=[0.0, 0.0], + rot_lim=[0.0, 0.0], + rand_flip=False, + is_train=False, + ), + dict( + type="Pack3DDetInputs", + keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "num_pts_feats", + "num_views", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + ], + ), +] + +filter_cfg = dict(filter_frames_with_camera_order=camera_order) diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py index 963a218e1..11e297c09 100644 --- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py @@ -1,6 +1,6 @@ # Dataset parameters backend_args = None -num_workers = 32 +num_workers = 8 input_modality = dict(use_lidar=True, use_camera=True) # range setting @@ -13,6 +13,8 @@ "bus": 120, "bicycle": 120, "pedestrian": 120, + "traffic_cone": 120, + "barrier": 120, } # LiDAR parameters @@ -56,7 +58,7 @@ final_dim=image_size, resize_lim=[0.29, 0.35], bot_pct_lim=[0.0, 0.0], - rot_lim=[-5.4, 5.4], + rot_lim=[0.0, 0.0], rand_flip=True, is_train=True, ), @@ -69,21 +71,21 @@ dict(type="BEVFusionRandomFlip3D"), dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range), dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range), + dict(type="BEVFusionRemoveLiDARPoints"), dict( type="ObjectNameFilter", classes=[ "car", "truck", - "construction_vehicle", "bus", - "trailer", - "barrier", - "motorcycle", "bicycle", "pedestrian", "traffic_cone", + "barrier", ], ), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=2), + dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=1), dict(type="PointShuffle"), dict( type="Pack3DDetInputs", @@ -107,6 +109,9 @@ "img_aug_matrix", "lidar_aug_matrix", "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", ], ), ] @@ -134,7 +139,7 @@ pad_empty_sweeps=True, remove_close=True, backend_args=backend_args, - test_mode=True, + test_mode=False, ), dict( type="ImageAug3D", @@ -164,6 +169,9 @@ "num_pts_feats", "num_views", "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", ], ), ] diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py index 09b9f7b26..d797779de 100644 --- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py @@ -7,6 +7,9 @@ point_cloud_range = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0] voxel_size = [0.17, 0.17, 0.2] grid_size = [1440, 1440, 41] +# Sparse dense output shapes +sparse_dense_output_shapes = [180, 180, 2] + eval_class_range = { "car": 120, "truck": 120, @@ -63,6 +66,8 @@ "barrier", ], ), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=5), + dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=3), dict(type="PointShuffle"), dict( type="Pack3DDetInputs", @@ -143,4 +148,4 @@ # e.g., dict(filter_frames_with_missing_image=True). # - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so # image-based filtering does not apply and `filter_cfg` is intentionally None. -filter_cfg = None +filter_cfg = dict() diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py new file mode 100644 index 000000000..ca75d799f --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py @@ -0,0 +1,148 @@ +# Dataset parameters +backend_args = None +num_workers = 4 +input_modality = dict(use_lidar=True, use_camera=False) + +# range setting +point_cloud_range = [-54.0, -54.0, -3.0, 54.0, 54.0, 5.0] +voxel_size = [0.075, 0.075, 0.2] +grid_size = [1440, 1440, 41] +eval_class_range = { + "car": 51.2, + "truck": 51.2, + "bus": 51.2, + "bicycle": 51.2, + "pedestrian": 51.2, + "traffic_cone": 51.2, + "barrier": 51.2, +} + +# LiDAR parameters +point_load_dim = 5 # x, y, z, intensity, ring_id +point_use_dim = 4 +lidar_sweep_dims = [0, 1, 2, 4] # x, y, z, time_lag +sweeps_num = 1 + +train_pipeline = [ + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=point_load_dim, + use_dim=point_load_dim, + backend_args=backend_args, + ), + dict( + type="LoadPointsFromMultiSweeps", + sweeps_num=sweeps_num, + load_dim=point_load_dim, + use_dim=lidar_sweep_dims, + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args, + test_mode=False, + ), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict( + type="BEVFusionGlobalRotScaleTrans", + scale_ratio_range=[0.95, 1.05], + rot_range=[-0.78539816, 0.78539816], + translation_std=[0.5, 0.5, 0.2], + ), + dict(type="BEVFusionRandomFlip3D"), + dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range), + dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range), + dict( + type="ObjectNameFilter", + classes=[ + "car", + "truck", + "bus", + "bicycle", + "pedestrian", + "traffic_cone", + "barrier", + ], + ), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=2), + dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=1), + dict(type="PointShuffle"), + dict( + type="Pack3DDetInputs", + keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "transformation_3d_flow", + "pcd_rotation", + "pcd_scale_factor", + "pcd_trans", + "img_aug_matrix", + "lidar_aug_matrix", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + ], + ), +] + +test_pipeline = [ + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=point_load_dim, + use_dim=point_load_dim, + backend_args=backend_args, + ), + dict( + type="LoadPointsFromMultiSweeps", + sweeps_num=sweeps_num, + load_dim=point_load_dim, + use_dim=lidar_sweep_dims, + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args, + test_mode=True, + ), + dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range), + dict( + type="Pack3DDetInputs", + keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "num_pts_feats", + "num_views", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + ], + ), +] + +# Filtering configuration +# Note: +# - In camera–LiDAR configs, `filter_cfg` can enable image-based frame filtering, +# e.g., dict(filter_frames_with_missing_image=True). +# - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so +# image-based filtering does not apply and `filter_cfg` is intentionally None. +filter_cfg = dict() diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py index e2de195e9..d5e426f58 100644 --- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py @@ -1,12 +1,15 @@ # Dataset parameters backend_args = None -num_workers = 32 +num_workers = 16 input_modality = dict(use_lidar=True, use_camera=False) # range setting point_cloud_range = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0] voxel_size = [0.17, 0.17, 0.2] grid_size = [1440, 1440, 41] +# Sparse dense output shapes +sparse_dense_output_shapes = [180, 180, 2] + eval_class_range = { "car": 120, "truck": 120, @@ -63,6 +66,8 @@ "barrier", ], ), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=2), + dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=1), dict(type="PointShuffle"), dict( type="Pack3DDetInputs", @@ -143,4 +148,4 @@ # e.g., dict(filter_frames_with_missing_image=True). # - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so # image-based filtering does not apply and `filter_cfg` is intentionally None. -filter_cfg = None +filter_cfg = dict() diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py similarity index 98% rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py index 64bc2b717..05740e442 100644 --- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py +++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py @@ -1,6 +1,6 @@ # learning rate -lr = 1e-4 -t_max = 6 +lr = 2e-4 +t_max = 2 max_epochs = 20 val_interval = 1 diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_cosine.py similarity index 85% rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_cosine.py rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_cosine.py index a2cd2d2e9..2893b2e74 100644 --- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_cosine.py +++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_cosine.py @@ -1,17 +1,16 @@ # learning rate -# 1e-4 * sqrt(2) = 0.0001414 -lr = 1.4141e-4 -t_max = 8 +lr = 2.0e-4 +t_max = 1 max_epochs = 30 val_interval = 5 -train_gpu_size = 8 -test_batch_size = 2 -train_batch_size = 8 +train_gpu_size = 2 +test_batch_size = 4 +train_batch_size = 32 param_scheduler = [ # learning rate scheduler - # During the first (max_epochs * 0.4) epochs, learning rate increases from 0 to lr * 10 + # During the first (max_epochs * 0.10) epochs, learning rate increases from 0 to lr * 10 # during the next epochs, learning rate decreases from lr * 10 to # lr * 1e-4 dict( @@ -33,7 +32,7 @@ convert_to_iter_based=True, ), # momentum scheduler - # During the first (0.4 * max_epochs) epochs, momentum increases from 0 to 0.85 / 0.95 + # During the first (max_epochs * 0.10) epochs, momentum increases from 0 to 0.85 / 0.95 # during the next epochs, momentum increases from 0.85 / 0.95 to 1 dict( type="CosineAnnealingMomentum", diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_linear_cosine.py new file mode 100644 index 000000000..95f5f96bd --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb16_adamw_linear_cosine.py @@ -0,0 +1,69 @@ +# learning rate +lr = 2e-4 +t_max = 3 +max_epochs = 30 +val_interval = 1 + +train_gpu_size = 2 +test_batch_size = 2 +train_batch_size = 32 + +param_scheduler = [ + # learning rate scheduler + dict( + type="LinearLR", + start_factor=1.0 / 3, + begin=0, + end=t_max, + by_epoch=True, + convert_to_iter_based=True, + ), + dict( + type="CosineAnnealingLR", + T_max=(max_epochs - t_max), + eta_min=lr * 1e-4, + begin=t_max, + end=max_epochs, + by_epoch=True, + convert_to_iter_based=True, + ), + # momentum scheduler + # During the first (0.4 * max_epochs) epochs, momentum increases from 0 to 0.85 / 0.95 + # during the next epochs, momentum increases from 0.85 / 0.95 to 1 + dict( + type="CosineAnnealingMomentum", + T_max=t_max, + eta_min=0.85 / 0.95, + begin=0, + end=t_max, + by_epoch=True, + convert_to_iter_based=True, + ), + dict( + type="CosineAnnealingMomentum", + T_max=(max_epochs - t_max), + eta_min=1, + begin=t_max, + end=max_epochs, + by_epoch=True, + convert_to_iter_based=True, + ), +] + +train_cfg = dict( + by_epoch=True, max_epochs=max_epochs, val_interval=val_interval, dynamic_intervals=[(max_epochs - 5, 1)] +) +val_cfg = dict() +test_cfg = dict() + +optim_wrapper = dict( + type="OptimWrapper", + optimizer=dict(type="AdamW", lr=lr, weight_decay=1e-2), + clip_grad=dict(max_norm=5.0, norm_type=2), +) + +auto_scale_lr = dict(enable=False, base_batch_size=train_gpu_size * train_batch_size) + +# Only set if the number of train_gpu_size more than 1 +if train_gpu_size > 1: + sync_bn = "torch" diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb8_adamw_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_cosine.py similarity index 86% rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb8_adamw_cosine.py rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_cosine.py index 87571d0b3..d209d0c1b 100644 --- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb8_adamw_cosine.py +++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_cosine.py @@ -1,17 +1,16 @@ # learning rate -# 1e-4 * sqrt(2) = 0.0001414 -lr = 1.4141e-4 -t_max = 15 +lr = 2.0e-4 +t_max = 5 max_epochs = 50 val_interval = 5 train_gpu_size = 8 -test_batch_size = 2 -train_batch_size = 8 +test_batch_size = 4 +train_batch_size = 16 param_scheduler = [ # learning rate scheduler - # During the first (max_epochs * 0.4) epochs, learning rate increases from 0 to lr * 10 + # During the first (max_epochs * 0.10) epochs, learning rate increases from 0 to lr * 10 # during the next epochs, learning rate decreases from lr * 10 to # lr * 1e-4 dict( @@ -33,7 +32,7 @@ convert_to_iter_based=True, ), # momentum scheduler - # During the first (0.4 * max_epochs) epochs, momentum increases from 0 to 0.85 / 0.95 + # During the first (0.10 * max_epochs) epochs, momentum increases from 0 to 0.85 / 0.95 # during the next epochs, momentum increases from 0.85 / 0.95 to 1 dict( type="CosineAnnealingMomentum", diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_linear_cosine.py similarity index 96% rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_linear_cosine.py index 23d29acc1..43715fed7 100644 --- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py +++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb16_adamw_linear_cosine.py @@ -1,8 +1,8 @@ # learning rate -lr = 1e-4 -t_max = 8 -max_epochs = 30 -val_interval = 1 +lr = 2e-4 +t_max = 5 +max_epochs = 50 +val_interval = 5 train_gpu_size = 8 test_batch_size = 2 diff --git a/projects/BEVFusion/deploy/containers.py b/projects/BEVFusion/deploy/containers.py index 51f2316df..ad9243412 100644 --- a/projects/BEVFusion/deploy/containers.py +++ b/projects/BEVFusion/deploy/containers.py @@ -1,8 +1,8 @@ +# Wrapper Classes for onnx conversion +import numpy as np import torch import torch.nn.functional as F -# Wrapper Classes for onnx conversion - class TrtBevFusionImageBackboneContainer(torch.nn.Module): def __init__(self, mod, mean, std) -> None: diff --git a/projects/BEVFusion/deploy/exporter.py b/projects/BEVFusion/deploy/exporter.py index b1a430eb9..c7cd9e6b0 100644 --- a/projects/BEVFusion/deploy/exporter.py +++ b/projects/BEVFusion/deploy/exporter.py @@ -2,7 +2,7 @@ import logging import os.path as osp -from typing import Optional +from typing import Any, Optional import numpy as np import onnx @@ -11,12 +11,32 @@ from builder import ExportBuilder from containers import TrtBevFusionCameraOnlyContainer, TrtBevFusionImageBackboneContainer, TrtBevFusionMainContainer from data_classes import ModelData, SetupConfigs -from mmdeploy.core import RewriterContext +from mmdeploy.core import SYMBOLIC_REWRITER, RewriterContext from mmdeploy.utils import ( get_root_logger, ) +def purge_mmdeploy_symbolics(op_names: list[str]) -> dict: + """Delete mmdeploy's symbolic records for the given op names. + Both the op-name key (e.g. `"layer_norm"`) and the function-path + bookkeeping key (e.g. `"mmdeploy.pytorch.symbolics.layer_norm.layer_norm__default"`) + are removed. Returns a snapshot of what was deleted for optional restore. + """ + records = SYMBOLIC_REWRITER._registry._rewrite_records + removed: dict = {} + for key in list(records.keys()): + # Primary key: the aten op name itself. + if key in op_names: + removed[key] = records.pop(key) + continue + # Bookkeeping key: full Python path of an implementer function. + # Match by "...symbolics.." or "...symbolics.__" + if any(f".symbolics.{op}." in key or f".symbolics.{op}__" in key for op in op_names): + removed[key] = records.pop(key) + return removed + + class Torch2OnnxExporter: def __init__(self, setup_configs: SetupConfigs, log_level: str): @@ -62,6 +82,10 @@ def _export_model( patched_model (torch.nn.Module): Patched Pytorch model. ir_configs (dict): Configs for intermediate representations in ONNX. """ + # Purge the mmdeploy symbolic records for the layer_norm op, remove this if LayerNorm OP is not supported + # in the tensorrt version + removed = purge_mmdeploy_symbolics(["layer_norm"]) + self.logger.info(f"Purged {len(removed)} mmdeploy symbolic records: {list(removed.keys())}") with RewriterContext(**context_info), torch.no_grad(): image_feats = None if "img_backbone" in self.setup_configs.model_cfg.model: diff --git a/projects/BEVFusion/setup.py b/projects/BEVFusion/setup.py index 38f588b20..52d397c12 100644 --- a/projects/BEVFusion/setup.py +++ b/projects/BEVFusion/setup.py @@ -54,6 +54,14 @@ def make_cuda_ext(name, module, sources, sources_cuda=[], extra_args=[], extra_i "src/bev_pool_cuda.cu", ], ), + make_cuda_ext( + name="bev_pool_v2_ext", + module="projects.BEVFusion.bevfusion.ops.bev_pool_v2", + sources=[ + "src/bev_pool.cpp", + "src/bev_pool_cuda.cu", + ], + ), make_cuda_ext( name="voxel_layer", module="projects.BEVFusion.bevfusion.ops.voxel",