From 88f0e926bd9163c0e192e1e0b0bd090c13574ba8 Mon Sep 17 00:00:00 2001 From: Max-Bin Date: Mon, 1 Jun 2026 15:16:15 +0900 Subject: [PATCH 1/5] perf(BEVFusion): bit-identical GPU voxelizer ~100x faster (training) The deterministic C++ hard_voxelize is O(N^2) (point_to_voxelidx scans every earlier point) plus single-thread (determin_voxel_num is launched <<<1, 1>>>), costing ~70ms per 250k-point frame and dominating the BEVFusion training data path. Add voxelize_fast_gpu: a parallel sort + unique + scatter PyTorch path that reproduces the C++ deterministic op's output bit-for-bit (coords, num_points, per-voxel mean) at ~0.7ms (~100x). Voxelization.forward uses it on GPU in deterministic mode and falls back to the C++ op on CPU, in non-deterministic mode, or when M > max_voxels. Unlike deterministic=False (fast but not reproducible), this is both fast AND deterministic. Deployment is unaffected: Autoware uses the spconv Point2VoxelGPU hash voxelizer, not this op. test_voxelize_fast.py verifies bit-identical output on synthetic clouds including dense clusters that trigger the max_num_points truncation. Signed-off-by: Max-Bin --- .../bevfusion/ops/voxel/test_voxelize_fast.py | 100 ++++++++++++++++++ .../BEVFusion/bevfusion/ops/voxel/voxelize.py | 75 +++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py diff --git a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py new file mode 100644 index 000000000..dbc35c8a9 --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py @@ -0,0 +1,100 @@ +"""Verify voxelize_fast_gpu is bit-identical to the C++ hard_voxelize op. + +Run on a CUDA machine with the compiled voxel op: + + cd projects/BEVFusion && python -m bevfusion.ops.voxel.test_voxelize_fast + +Checks, on synthetic clouds (incl. dense clusters that trigger the +max_num_points truncation), that the fast path reproduces the C++ +deterministic op's coords / num_points / per-voxel mean (HardSimpleVFE input) +bit-for-bit. The voxel ROW order differs (sort-order vs first-appearance) and is +aligned away before comparison — it is irrelevant to the sparse encoder, which +indexes voxels by coords. +""" + +import time + +import torch + +from bevfusion.ops.voxel.voxelize import voxelization, voxelize_fast_gpu + + +def _flat(coors, gx, gy): + return coors[:, 2].long() * gy * gx + coors[:, 1].long() * gx + coors[:, 0].long() + + +def check(name, points, voxel_size, pcr, max_points, max_voxels): + gx = round((pcr[3] - pcr[0]) / voxel_size[0]) + gy = round((pcr[4] - pcr[1]) / voxel_size[1]) + vc, cc, nc = voxelization(points, voxel_size, pcr, max_points, max_voxels, True) + vf, cf, nf = voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels) + + oc, of = torch.argsort(_flat(cc, gx, gy)), torch.argsort(_flat(cf, gx, gy)) + coord_eq = torch.equal(_flat(cc, gx, gy)[oc], _flat(cf, gx, gy)[of]) + num_eq = torch.equal(nc[oc], nf[of]) + # per-voxel mean = HardSimpleVFE input; bit-identical iff kept point-sets match + mc = vc.sum(1) / nc.view(-1, 1).float() + mf = vf.sum(1) / nf.view(-1, 1).float() + vfe_max = (mc[oc] - mf[of]).abs().max().item() if coord_eq else float("nan") + ok = coord_eq and num_eq and vfe_max == 0.0 + + # timing: per-frame voxelize, C++ op vs fast (30 runs after warmup) + for _ in range(5): + voxelization(points, voxel_size, pcr, max_points, max_voxels, True) + voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels) + torch.cuda.synchronize() + t0 = time.perf_counter() + for _ in range(30): + voxelization(points, voxel_size, pcr, max_points, max_voxels, True) + torch.cuda.synchronize() + tc = (time.perf_counter() - t0) / 30 * 1000 + t0 = time.perf_counter() + for _ in range(30): + voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels) + torch.cuda.synchronize() + tf = (time.perf_counter() - t0) / 30 * 1000 + + print(f"{name:8s}: M={cc.shape[0]:>7d} coord_eq={coord_eq} num_eq={num_eq} " + f"vfe_maxdiff={vfe_max:.1e} | cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x) " + f"{'PASS' if ok else 'FAIL'}") + return ok + + +def make_cloud(num_voxels, max_pts_per, vs, pcr, dev): + """Synthetic cloud with a CONTROLLED voxel count (< max_voxels) and up to + max_pts_per points per voxel (exercises the max_num_points truncation). + Uniform-random points would yield M ~ N voxels (> max_voxels), which is not + representative of a real LiDAR frame (M ~ 100k for a 120m sweep).""" + gx = round((pcr[3] - pcr[0]) / vs[0]) + gy = round((pcr[4] - pcr[1]) / vs[1]) + gz = round((pcr[5] - pcr[2]) / vs[2]) + vst = torch.tensor(vs, device=dev) + rmin = torch.tensor(pcr[:3], device=dev) + cells = torch.stack([ + torch.randint(0, gx, (num_voxels,), device=dev), + torch.randint(0, gy, (num_voxels,), device=dev), + torch.randint(0, gz, (num_voxels,), device=dev)], dim=1) + counts = torch.randint(1, max_pts_per + 1, (num_voxels,), device=dev) + rep = cells.repeat_interleave(counts, 0).float() + jitter = torch.rand(rep.shape[0], 3, device=dev) * 0.999 # stay inside the cell + xyz = (rep + jitter) * vst + rmin + return torch.cat([xyz, torch.rand(rep.shape[0], 2, device=dev)], dim=1) + + +def main(): + assert torch.cuda.is_available(), "needs a GPU + the compiled voxel op" + dev = "cuda" + vs = [0.17, 0.17, 0.2] + pcr = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0] + mnp, maxv = 10, 120000 + torch.manual_seed(0) + # ~90k voxels (< max_voxels=120k, like a real 120m LiDAR frame); the dense + # case uses up to 15 points/voxel to exercise the max_num_points=10 truncation. + ok = True + ok &= check("sparse", make_cloud(90000, 3, vs, pcr, dev), vs, pcr, mnp, maxv) + ok &= check("dense", make_cloud(90000, 15, vs, pcr, dev), vs, pcr, mnp, maxv) + print("\nALL PASS" if ok else "\nFAILED") + + +if __name__ == "__main__": + main() diff --git a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py index 1a466eced..8d911ae13 100644 --- a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py +++ b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py @@ -73,6 +73,66 @@ def forward(ctx, points, voxel_size, coors_range, max_points=35, max_voxels=2000 voxelization = _Voxelization.apply +@torch.no_grad() +def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max_voxels): + """Fast GPU hard-voxelizer, bit-identical to the C++ ``hard_voxelize`` + (``deterministic=True``) but ~100x faster. + + The C++ deterministic path is O(N^2) (``point_to_voxelidx`` scans every + earlier point) plus single-thread (``determin_voxel_num`` is launched + ``<<<1, 1>>>``), costing ~70 ms per 250k-point frame. This reproduces its + EXACT output via parallel sort + unique + scatter (~0.7 ms): + + * range filter, then ``coord = floor((xyz - min) / voxel_size)`` + * stable-sort by flat voxel id so in-voxel points keep their original + index order (this matches the C++ ``num`` = count of earlier points in + the same voxel, i.e. the first ``max_num_points`` are kept) + * ``num_points_per_voxel = min(count, max_num_points)`` + + The voxel ROW order differs (unique-sorted here vs. the C++ first-appearance + order) but is irrelevant downstream: the sparse encoder indexes voxels by + their coords. Returns ``None`` when ``M > max_voxels`` so the caller can fall + back to the C++ op, whose voxel-order-dependent truncation we do not + replicate in that (rare) case. + """ + dev = points.device + vs = torch.tensor(voxel_size, device=dev, dtype=points.dtype) + rmin = torch.tensor(point_cloud_range[:3], device=dev, dtype=points.dtype) + rmax = torch.tensor(point_cloud_range[3:], device=dev, dtype=points.dtype) + grid = torch.round((rmax - rmin) / vs).long() + gx, gy = int(grid[0]), int(grid[1]) + + in_range = ((points[:, :3] >= rmin) & (points[:, :3] < rmax)).all(dim=1) + pts = points[in_range] + feat_dim = points.shape[1] + if pts.shape[0] == 0: + return ( + points.new_zeros((0, max_num_points, feat_dim)), + points.new_zeros((0, 3), dtype=torch.int32), + points.new_zeros((0,), dtype=torch.int32), + ) + coord = torch.floor((pts[:, :3] - rmin) / vs).long() # (Nv, 3), >= 0 + flat = coord[:, 2] * (gy * gx) + coord[:, 1] * gx + coord[:, 0] + + order = torch.argsort(flat, stable=True) # stable -> in-voxel original order + flat_s, pts_s, coord_s = flat[order], pts[order], coord[order] + uniq, inv, counts = torch.unique_consecutive(flat_s, return_inverse=True, return_counts=True) + num_voxels = int(uniq.shape[0]) + if num_voxels > max_voxels: + return None # caller falls back to the C++ op + + seg_start = torch.zeros(num_voxels, dtype=torch.long, device=dev) + seg_start[1:] = torch.cumsum(counts, 0)[:-1] + rank = torch.arange(flat_s.shape[0], device=dev) - seg_start[inv] + keep = rank < max_num_points + + voxels = torch.zeros(num_voxels, max_num_points, feat_dim, device=dev, dtype=pts.dtype) + voxels[inv[keep], rank[keep]] = pts_s[keep] + num_points_per_voxel = counts.clamp(max=max_num_points).to(torch.int32) + coors = coord_s[seg_start].to(torch.int32) # (M, 3) = (x, y, z), matching the C++ op + return voxels, coors, num_points_per_voxel + + class Voxelization(nn.Module): def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000, deterministic=True): @@ -126,6 +186,21 @@ def forward(self, input): else: max_voxels = self.max_voxels[1] + # Fast parallel path for the deterministic case: bit-identical to the + # C++ hard_voxelize(deterministic=True) but ~100x faster on GPU (the C++ + # path is O(N^2) + single-thread). Falls back to the C++ op on CPU, in + # non-deterministic mode, or when M > max_voxels. + if self.deterministic and input.is_cuda: + out = voxelize_fast_gpu( + input, + self.voxel_size, + self.point_cloud_range, + self.max_num_points, + max_voxels, + ) + if out is not None: + return out + return voxelization( input, self.voxel_size, From 3029147d36ad2d8b10fedba2b044078016c26415 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Jun 2026 07:10:00 +0000 Subject: [PATCH 2/5] ci(pre-commit): autofix --- .../bevfusion/ops/voxel/test_voxelize_fast.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py index dbc35c8a9..c3db41100 100644 --- a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py +++ b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py @@ -15,7 +15,6 @@ import time import torch - from bevfusion.ops.voxel.voxelize import voxelization, voxelize_fast_gpu @@ -54,9 +53,11 @@ def check(name, points, voxel_size, pcr, max_points, max_voxels): torch.cuda.synchronize() tf = (time.perf_counter() - t0) / 30 * 1000 - print(f"{name:8s}: M={cc.shape[0]:>7d} coord_eq={coord_eq} num_eq={num_eq} " - f"vfe_maxdiff={vfe_max:.1e} | cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x) " - f"{'PASS' if ok else 'FAIL'}") + print( + f"{name:8s}: M={cc.shape[0]:>7d} coord_eq={coord_eq} num_eq={num_eq} " + f"vfe_maxdiff={vfe_max:.1e} | cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x) " + f"{'PASS' if ok else 'FAIL'}" + ) return ok @@ -70,10 +71,14 @@ def make_cloud(num_voxels, max_pts_per, vs, pcr, dev): gz = round((pcr[5] - pcr[2]) / vs[2]) vst = torch.tensor(vs, device=dev) rmin = torch.tensor(pcr[:3], device=dev) - cells = torch.stack([ - torch.randint(0, gx, (num_voxels,), device=dev), - torch.randint(0, gy, (num_voxels,), device=dev), - torch.randint(0, gz, (num_voxels,), device=dev)], dim=1) + cells = torch.stack( + [ + torch.randint(0, gx, (num_voxels,), device=dev), + torch.randint(0, gy, (num_voxels,), device=dev), + torch.randint(0, gz, (num_voxels,), device=dev), + ], + dim=1, + ) counts = torch.randint(1, max_pts_per + 1, (num_voxels,), device=dev) rep = cells.repeat_interleave(counts, 0).float() jitter = torch.rand(rep.shape[0], 3, device=dev) * 0.999 # stay inside the cell From 7fc9d22c99f39ffd72ba9be380c3cdcb8a0b08ec Mon Sep 17 00:00:00 2001 From: Max-Bin Date: Mon, 1 Jun 2026 16:19:03 +0900 Subject: [PATCH 3/5] fix(voxelize): drop out-of-grid voxels to match the C++ op exactly Per review (P2): the fast path used a raw range filter (>= min & < max), but the CUDA dynamic_voxelize_kernel computes the voxel index and drops points whose index is out of the rounded grid (c < 0 or c >= grid). For non-integer range/voxel_size, or points at the upper bound that floor up to c == grid, the two diverged and the fast path could emit out-of-grid voxels the encoder would reject. Now compute coord first and keep only 0 <= coord < grid, exactly like the C++ kernel. Verified bit-identical (coords / num_points / per-voxel mean) vs the C++ op on non-integer-grid + upper-edge synthetic clouds and on real frames; fast output is always within the grid. Signed-off-by: Max-Bin --- projects/BEVFusion/bevfusion/ops/voxel/voxelize.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py index 8d911ae13..0adbb3f7d 100644 --- a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py +++ b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py @@ -102,16 +102,23 @@ def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max grid = torch.round((rmax - rmin) / vs).long() gx, gy = int(grid[0]), int(grid[1]) - in_range = ((points[:, :3] >= rmin) & (points[:, :3] < rmax)).all(dim=1) - pts = points[in_range] feat_dim = points.shape[1] + # Match the CUDA dynamic_voxelize_kernel EXACTLY: compute the voxel index for + # every point, then drop any whose index is outside the rounded grid + # (c_* < 0 or c_* >= grid_*). A raw `>= min & < max` range filter is NOT + # equivalent when range/voxel_size isn't an integer, or when a point at the + # upper bound floors up to c_* == grid_* — the C++ op discards those, so the + # fast path must too (else it emits out-of-grid voxels the encoder rejects). + coord = torch.floor((points[:, :3] - rmin) / vs).long() # (N, 3) + valid = ((coord >= 0) & (coord < grid)).all(dim=1) + pts = points[valid] + coord = coord[valid] if pts.shape[0] == 0: return ( points.new_zeros((0, max_num_points, feat_dim)), points.new_zeros((0, 3), dtype=torch.int32), points.new_zeros((0,), dtype=torch.int32), ) - coord = torch.floor((pts[:, :3] - rmin) / vs).long() # (Nv, 3), >= 0 flat = coord[:, 2] * (gy * gx) + coord[:, 1] * gx + coord[:, 0] order = torch.argsort(flat, stable=True) # stable -> in-voxel original order From b84a3e15fa70c5fa113a73aa79a2023b33de4353 Mon Sep 17 00:00:00 2001 From: Max-Bin Date: Tue, 2 Jun 2026 10:14:28 +0900 Subject: [PATCH 4/5] perf(voxelize): drop per-forward .item() sync; strengthen equivalence test Address review feedback on the fast GPU voxelizer: - voxelize_fast_gpu: compute the grid dims on the host from the config instead of via a CUDA tensor, removing the per-forward device->host .item() sync on this hot path (Copilot). floor(x+0.5) mirrors the C++ round() exactly; grid values verified identical to the old computation across configs, and the op stays bit-identical on 8 real frames (coords / num_points / full voxel tensors, max_abs_diff = 0). - Comments: clarify that the z-major linear key is an internal sort/group key only and the output coords are (x, y, z) (matching this repo's compiled hard_voxelize), and why M > max_voxels falls back to the C++ op rather than truncating here (preserves bit-identity) (KSeangTan). - test: convert to unittest (auto-skips without CUDA) and compare the full per-voxel tensors instead of only per-voxel means, so kept point-set and slot-order differences are caught, not just permutation-invariant means (Copilot). Signed-off-by: Max-Bin --- .../bevfusion/ops/voxel/test_voxelize_fast.py | 121 +++++++++++------- .../BEVFusion/bevfusion/ops/voxel/voxelize.py | 32 ++++- 2 files changed, 99 insertions(+), 54 deletions(-) diff --git a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py index c3db41100..dfd1af59a 100644 --- a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py +++ b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py @@ -1,64 +1,53 @@ """Verify voxelize_fast_gpu is bit-identical to the C++ hard_voxelize op. -Run on a CUDA machine with the compiled voxel op: +Run as a unittest (skips when no CUDA / compiled op is available): + + cd projects/BEVFusion && python -m unittest bevfusion.ops.voxel.test_voxelize_fast + +or directly, which also prints the speed benchmark: cd projects/BEVFusion && python -m bevfusion.ops.voxel.test_voxelize_fast -Checks, on synthetic clouds (incl. dense clusters that trigger the -max_num_points truncation), that the fast path reproduces the C++ -deterministic op's coords / num_points / per-voxel mean (HardSimpleVFE input) -bit-for-bit. The voxel ROW order differs (sort-order vs first-appearance) and is -aligned away before comparison — it is irrelevant to the sparse encoder, which -indexes voxels by coords. +On synthetic clouds (incl. dense clusters that trigger the max_num_points +truncation) it checks that the fast path reproduces the C++ deterministic op's +coords / num_points / and the FULL per-voxel point tensors bit-for-bit. The +voxel ROW order differs (sort-order vs first-appearance) and is aligned away by +sorting both on the voxel id before comparison — it is irrelevant to the sparse +encoder, which indexes voxels by coords. """ import time +import unittest import torch + from bevfusion.ops.voxel.voxelize import voxelization, voxelize_fast_gpu +VS = [0.17, 0.17, 0.2] +PCR = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0] +MNP, MAXV = 10, 120000 + def _flat(coors, gx, gy): return coors[:, 2].long() * gy * gx + coors[:, 1].long() * gx + coors[:, 0].long() -def check(name, points, voxel_size, pcr, max_points, max_voxels): +def compare(points, voxel_size, pcr, max_points, max_voxels): + """Return (coord_eq, num_eq, vox_max_abs_diff) of fast vs C++ op.""" gx = round((pcr[3] - pcr[0]) / voxel_size[0]) gy = round((pcr[4] - pcr[1]) / voxel_size[1]) vc, cc, nc = voxelization(points, voxel_size, pcr, max_points, max_voxels, True) vf, cf, nf = voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels) + # Align the two voxel orderings (sort-order vs first-appearance) by voxel id. oc, of = torch.argsort(_flat(cc, gx, gy)), torch.argsort(_flat(cf, gx, gy)) coord_eq = torch.equal(_flat(cc, gx, gy)[oc], _flat(cf, gx, gy)[of]) num_eq = torch.equal(nc[oc], nf[of]) - # per-voxel mean = HardSimpleVFE input; bit-identical iff kept point-sets match - mc = vc.sum(1) / nc.view(-1, 1).float() - mf = vf.sum(1) / nf.view(-1, 1).float() - vfe_max = (mc[oc] - mf[of]).abs().max().item() if coord_eq else float("nan") - ok = coord_eq and num_eq and vfe_max == 0.0 - - # timing: per-frame voxelize, C++ op vs fast (30 runs after warmup) - for _ in range(5): - voxelization(points, voxel_size, pcr, max_points, max_voxels, True) - voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels) - torch.cuda.synchronize() - t0 = time.perf_counter() - for _ in range(30): - voxelization(points, voxel_size, pcr, max_points, max_voxels, True) - torch.cuda.synchronize() - tc = (time.perf_counter() - t0) / 30 * 1000 - t0 = time.perf_counter() - for _ in range(30): - voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels) - torch.cuda.synchronize() - tf = (time.perf_counter() - t0) / 30 * 1000 - - print( - f"{name:8s}: M={cc.shape[0]:>7d} coord_eq={coord_eq} num_eq={num_eq} " - f"vfe_maxdiff={vfe_max:.1e} | cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x) " - f"{'PASS' if ok else 'FAIL'}" - ) - return ok + # Stronger than per-voxel means (which are permutation-invariant): compare + # the FULL aligned voxel tensors. Both ops fill slots in original point + # order, so the kept point sets AND their slot order must match exactly. + vox_max = (vc[oc] - vf[of]).abs().max().item() if coord_eq else float("nan") + return coord_eq, num_eq, vox_max def make_cloud(num_voxels, max_pts_per, vs, pcr, dev): @@ -86,20 +75,56 @@ def make_cloud(num_voxels, max_pts_per, vs, pcr, dev): return torch.cat([xyz, torch.rand(rep.shape[0], 2, device=dev)], dim=1) -def main(): +@unittest.skipUnless(torch.cuda.is_available(), "needs a CUDA GPU + the compiled voxel op") +class TestVoxelizeFast(unittest.TestCase): + """voxelize_fast_gpu must be bit-identical to the deterministic C++ op.""" + + def _check(self, num_voxels, max_pts_per): + torch.manual_seed(0) + pts = make_cloud(num_voxels, max_pts_per, VS, PCR, "cuda") + coord_eq, num_eq, vox_max = compare(pts, VS, PCR, MNP, MAXV) + self.assertTrue(coord_eq, "voxel coords differ from the C++ op") + self.assertTrue(num_eq, "num_points_per_voxel differs from the C++ op") + self.assertEqual(vox_max, 0.0, f"voxel features differ (max_abs_diff={vox_max})") + + def test_sparse(self): + # ~90k voxels (< max_voxels), <= 3 pts/voxel: no max_num_points truncation. + self._check(90000, 3) + + def test_dense(self): + # up to 15 pts/voxel exercises the max_num_points=10 truncation (which + # points are kept must match the C++ op exactly). + self._check(90000, 15) + + +def _benchmark(): + """Print equivalence + per-frame voxelize timing (C++ op vs fast path).""" assert torch.cuda.is_available(), "needs a GPU + the compiled voxel op" dev = "cuda" - vs = [0.17, 0.17, 0.2] - pcr = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0] - mnp, maxv = 10, 120000 - torch.manual_seed(0) - # ~90k voxels (< max_voxels=120k, like a real 120m LiDAR frame); the dense - # case uses up to 15 points/voxel to exercise the max_num_points=10 truncation. - ok = True - ok &= check("sparse", make_cloud(90000, 3, vs, pcr, dev), vs, pcr, mnp, maxv) - ok &= check("dense", make_cloud(90000, 15, vs, pcr, dev), vs, pcr, mnp, maxv) - print("\nALL PASS" if ok else "\nFAILED") + for name, mpp in (("sparse", 3), ("dense", 15)): + torch.manual_seed(0) + pts = make_cloud(90000, mpp, VS, PCR, dev) + coord_eq, num_eq, vox_max = compare(pts, VS, PCR, MNP, MAXV) + for _ in range(5): + voxelization(pts, VS, PCR, MNP, MAXV, True) + voxelize_fast_gpu(pts, VS, PCR, MNP, MAXV) + torch.cuda.synchronize() + t0 = time.perf_counter() + for _ in range(30): + voxelization(pts, VS, PCR, MNP, MAXV, True) + torch.cuda.synchronize() + tc = (time.perf_counter() - t0) / 30 * 1000 + t0 = time.perf_counter() + for _ in range(30): + voxelize_fast_gpu(pts, VS, PCR, MNP, MAXV) + torch.cuda.synchronize() + tf = (time.perf_counter() - t0) / 30 * 1000 + ok = coord_eq and num_eq and vox_max == 0.0 + print( + f"{name:8s}: coord_eq={coord_eq} num_eq={num_eq} vox_maxdiff={vox_max:.1e} | " + f"cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x) {'PASS' if ok else 'FAIL'}" + ) if __name__ == "__main__": - main() + _benchmark() diff --git a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py index 0adbb3f7d..fbee72768 100644 --- a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py +++ b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py @@ -1,4 +1,6 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math + import torch from torch import nn from torch.autograd import Function @@ -96,11 +98,16 @@ def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max replicate in that (rare) case. """ dev = points.device + # Grid dims from the python config on the HOST — never a CUDA tensor — so + # there is no per-forward device->host `.item()` sync (this path is hot). + # `floor(x + 0.5)` mirrors the C++ op's `round()` exactly + # (voxelization_cuda.cu: `grid_* = round((max - min) / voxel_*)`). + gx = int(math.floor((point_cloud_range[3] - point_cloud_range[0]) / voxel_size[0] + 0.5)) + gy = int(math.floor((point_cloud_range[4] - point_cloud_range[1]) / voxel_size[1] + 0.5)) + gz = int(math.floor((point_cloud_range[5] - point_cloud_range[2]) / voxel_size[2] + 0.5)) vs = torch.tensor(voxel_size, device=dev, dtype=points.dtype) rmin = torch.tensor(point_cloud_range[:3], device=dev, dtype=points.dtype) - rmax = torch.tensor(point_cloud_range[3:], device=dev, dtype=points.dtype) - grid = torch.round((rmax - rmin) / vs).long() - gx, gy = int(grid[0]), int(grid[1]) + grid = torch.tensor([gx, gy, gz], device=dev) # small constant, no .item() sync feat_dim = points.shape[1] # Match the CUDA dynamic_voxelize_kernel EXACTLY: compute the voxel index for @@ -119,14 +126,23 @@ def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max points.new_zeros((0, 3), dtype=torch.int32), points.new_zeros((0,), dtype=torch.int32), ) + # Internal grouping key only: linearize the (x, y, z) voxel index with z as + # the most-significant axis so a stable sort groups same-voxel points and + # preserves their original order. This z-major key does NOT change the + # output coord order (`coors` below is (x, y, z)). flat = coord[:, 2] * (gy * gx) + coord[:, 1] * gx + coord[:, 0] order = torch.argsort(flat, stable=True) # stable -> in-voxel original order flat_s, pts_s, coord_s = flat[order], pts[order], coord[order] uniq, inv, counts = torch.unique_consecutive(flat_s, return_inverse=True, return_counts=True) - num_voxels = int(uniq.shape[0]) + num_voxels = int(uniq.shape[0]) # tensor.shape is a host int — no device sync if num_voxels > max_voxels: - return None # caller falls back to the C++ op + # Bail to the C++ op (don't just keep the first max_voxels here): the C++ + # truncation when M > max_voxels drops voxels by FIRST-APPEARANCE order, + # which the sort-order fast path doesn't reproduce — keeping max_voxels + # here would retain a different voxel set and break bit-identity. This + # branch is rare in practice (M ~ 100k < max_voxels = 120k for a 120m sweep). + return None seg_start = torch.zeros(num_voxels, dtype=torch.long, device=dev) seg_start[1:] = torch.cumsum(counts, 0)[:-1] @@ -136,7 +152,11 @@ def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max voxels = torch.zeros(num_voxels, max_num_points, feat_dim, device=dev, dtype=pts.dtype) voxels[inv[keep], rank[keep]] = pts_s[keep] num_points_per_voxel = counts.clamp(max=max_num_points).to(torch.int32) - coors = coord_s[seg_start].to(torch.int32) # (M, 3) = (x, y, z), matching the C++ op + # (x, y, z) order — matches this repo's compiled hard_voxelize, whose + # dynamic_voxelize_kernel writes coors_offset[0..2] = c_x, c_y, c_z + # (voxelization_cuda.cu). Any (z, y, x) flip for the sparse encoder happens + # downstream and applies equally to this and the C++ op's output. + coors = coord_s[seg_start].to(torch.int32) # (M, 3) = (x, y, z) return voxels, coors, num_points_per_voxel From d93cfe48f7d66e6b2385b98788aaa07af2ed4833 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 01:15:26 +0000 Subject: [PATCH 5/5] ci(pre-commit): autofix --- projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py index dfd1af59a..fe4233701 100644 --- a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py +++ b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py @@ -20,7 +20,6 @@ import unittest import torch - from bevfusion.ops.voxel.voxelize import voxelization, voxelize_fast_gpu VS = [0.17, 0.17, 0.2]