From 88f0e926bd9163c0e192e1e0b0bd090c13574ba8 Mon Sep 17 00:00:00 2001
From: Max-Bin <vborisw@gmail.com>
Date: Mon, 1 Jun 2026 15:16:15 +0900
Subject: [PATCH 1/5] perf(BEVFusion): bit-identical GPU voxelizer ~100x faster
 (training)

The deterministic C++ hard_voxelize is O(N^2) (point_to_voxelidx scans every
earlier point) plus single-thread (determin_voxel_num is launched <<<1, 1>>>),
costing ~70ms per 250k-point frame and dominating the BEVFusion training data
path.

Add voxelize_fast_gpu: a parallel sort + unique + scatter PyTorch path that
reproduces the C++ deterministic op's output bit-for-bit (coords, num_points,
per-voxel mean) at ~0.7ms (~100x). Voxelization.forward uses it on GPU in
deterministic mode and falls back to the C++ op on CPU, in non-deterministic
mode, or when M > max_voxels. Unlike deterministic=False (fast but not
reproducible), this is both fast AND deterministic.

Deployment is unaffected: Autoware uses the spconv Point2VoxelGPU hash
voxelizer, not this op. test_voxelize_fast.py verifies bit-identical output on
synthetic clouds including dense clusters that trigger the max_num_points
truncation.

Signed-off-by: Max-Bin <vborisw@gmail.com>
---
 .../bevfusion/ops/voxel/test_voxelize_fast.py | 100 ++++++++++++++++++
 .../BEVFusion/bevfusion/ops/voxel/voxelize.py |  75 +++++++++++++
 2 files changed, 175 insertions(+)
 create mode 100644 projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py

diff --git a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
new file mode 100644
index 000000000..dbc35c8a9
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
@@ -0,0 +1,100 @@
+"""Verify voxelize_fast_gpu is bit-identical to the C++ hard_voxelize op.
+
+Run on a CUDA machine with the compiled voxel op:
+
+    cd projects/BEVFusion && python -m bevfusion.ops.voxel.test_voxelize_fast
+
+Checks, on synthetic clouds (incl. dense clusters that trigger the
+max_num_points truncation), that the fast path reproduces the C++
+deterministic op's coords / num_points / per-voxel mean (HardSimpleVFE input)
+bit-for-bit. The voxel ROW order differs (sort-order vs first-appearance) and is
+aligned away before comparison — it is irrelevant to the sparse encoder, which
+indexes voxels by coords.
+"""
+
+import time
+
+import torch
+
+from bevfusion.ops.voxel.voxelize import voxelization, voxelize_fast_gpu
+
+
+def _flat(coors, gx, gy):
+    return coors[:, 2].long() * gy * gx + coors[:, 1].long() * gx + coors[:, 0].long()
+
+
+def check(name, points, voxel_size, pcr, max_points, max_voxels):
+    gx = round((pcr[3] - pcr[0]) / voxel_size[0])
+    gy = round((pcr[4] - pcr[1]) / voxel_size[1])
+    vc, cc, nc = voxelization(points, voxel_size, pcr, max_points, max_voxels, True)
+    vf, cf, nf = voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels)
+
+    oc, of = torch.argsort(_flat(cc, gx, gy)), torch.argsort(_flat(cf, gx, gy))
+    coord_eq = torch.equal(_flat(cc, gx, gy)[oc], _flat(cf, gx, gy)[of])
+    num_eq = torch.equal(nc[oc], nf[of])
+    # per-voxel mean = HardSimpleVFE input; bit-identical iff kept point-sets match
+    mc = vc.sum(1) / nc.view(-1, 1).float()
+    mf = vf.sum(1) / nf.view(-1, 1).float()
+    vfe_max = (mc[oc] - mf[of]).abs().max().item() if coord_eq else float("nan")
+    ok = coord_eq and num_eq and vfe_max == 0.0
+
+    # timing: per-frame voxelize, C++ op vs fast (30 runs after warmup)
+    for _ in range(5):
+        voxelization(points, voxel_size, pcr, max_points, max_voxels, True)
+        voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels)
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    for _ in range(30):
+        voxelization(points, voxel_size, pcr, max_points, max_voxels, True)
+    torch.cuda.synchronize()
+    tc = (time.perf_counter() - t0) / 30 * 1000
+    t0 = time.perf_counter()
+    for _ in range(30):
+        voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels)
+    torch.cuda.synchronize()
+    tf = (time.perf_counter() - t0) / 30 * 1000
+
+    print(f"{name:8s}: M={cc.shape[0]:>7d} coord_eq={coord_eq} num_eq={num_eq} "
+          f"vfe_maxdiff={vfe_max:.1e} | cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x)  "
+          f"{'PASS' if ok else 'FAIL'}")
+    return ok
+
+
+def make_cloud(num_voxels, max_pts_per, vs, pcr, dev):
+    """Synthetic cloud with a CONTROLLED voxel count (< max_voxels) and up to
+    max_pts_per points per voxel (exercises the max_num_points truncation).
+    Uniform-random points would yield M ~ N voxels (> max_voxels), which is not
+    representative of a real LiDAR frame (M ~ 100k for a 120m sweep)."""
+    gx = round((pcr[3] - pcr[0]) / vs[0])
+    gy = round((pcr[4] - pcr[1]) / vs[1])
+    gz = round((pcr[5] - pcr[2]) / vs[2])
+    vst = torch.tensor(vs, device=dev)
+    rmin = torch.tensor(pcr[:3], device=dev)
+    cells = torch.stack([
+        torch.randint(0, gx, (num_voxels,), device=dev),
+        torch.randint(0, gy, (num_voxels,), device=dev),
+        torch.randint(0, gz, (num_voxels,), device=dev)], dim=1)
+    counts = torch.randint(1, max_pts_per + 1, (num_voxels,), device=dev)
+    rep = cells.repeat_interleave(counts, 0).float()
+    jitter = torch.rand(rep.shape[0], 3, device=dev) * 0.999  # stay inside the cell
+    xyz = (rep + jitter) * vst + rmin
+    return torch.cat([xyz, torch.rand(rep.shape[0], 2, device=dev)], dim=1)
+
+
+def main():
+    assert torch.cuda.is_available(), "needs a GPU + the compiled voxel op"
+    dev = "cuda"
+    vs = [0.17, 0.17, 0.2]
+    pcr = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0]
+    mnp, maxv = 10, 120000
+    torch.manual_seed(0)
+    # ~90k voxels (< max_voxels=120k, like a real 120m LiDAR frame); the dense
+    # case uses up to 15 points/voxel to exercise the max_num_points=10 truncation.
+    ok = True
+    ok &= check("sparse", make_cloud(90000, 3, vs, pcr, dev), vs, pcr, mnp, maxv)
+    ok &= check("dense", make_cloud(90000, 15, vs, pcr, dev), vs, pcr, mnp, maxv)
+    print("\nALL PASS" if ok else "\nFAILED")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
index 1a466eced..8d911ae13 100644
--- a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
+++ b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
@@ -73,6 +73,66 @@ def forward(ctx, points, voxel_size, coors_range, max_points=35, max_voxels=2000
 voxelization = _Voxelization.apply
 
 
+@torch.no_grad()
+def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max_voxels):
+    """Fast GPU hard-voxelizer, bit-identical to the C++ ``hard_voxelize``
+    (``deterministic=True``) but ~100x faster.
+
+    The C++ deterministic path is O(N^2) (``point_to_voxelidx`` scans every
+    earlier point) plus single-thread (``determin_voxel_num`` is launched
+    ``<<<1, 1>>>``), costing ~70 ms per 250k-point frame. This reproduces its
+    EXACT output via parallel sort + unique + scatter (~0.7 ms):
+
+      * range filter, then ``coord = floor((xyz - min) / voxel_size)``
+      * stable-sort by flat voxel id so in-voxel points keep their original
+        index order (this matches the C++ ``num`` = count of earlier points in
+        the same voxel, i.e. the first ``max_num_points`` are kept)
+      * ``num_points_per_voxel = min(count, max_num_points)``
+
+    The voxel ROW order differs (unique-sorted here vs. the C++ first-appearance
+    order) but is irrelevant downstream: the sparse encoder indexes voxels by
+    their coords. Returns ``None`` when ``M > max_voxels`` so the caller can fall
+    back to the C++ op, whose voxel-order-dependent truncation we do not
+    replicate in that (rare) case.
+    """
+    dev = points.device
+    vs = torch.tensor(voxel_size, device=dev, dtype=points.dtype)
+    rmin = torch.tensor(point_cloud_range[:3], device=dev, dtype=points.dtype)
+    rmax = torch.tensor(point_cloud_range[3:], device=dev, dtype=points.dtype)
+    grid = torch.round((rmax - rmin) / vs).long()
+    gx, gy = int(grid[0]), int(grid[1])
+
+    in_range = ((points[:, :3] >= rmin) & (points[:, :3] < rmax)).all(dim=1)
+    pts = points[in_range]
+    feat_dim = points.shape[1]
+    if pts.shape[0] == 0:
+        return (
+            points.new_zeros((0, max_num_points, feat_dim)),
+            points.new_zeros((0, 3), dtype=torch.int32),
+            points.new_zeros((0,), dtype=torch.int32),
+        )
+    coord = torch.floor((pts[:, :3] - rmin) / vs).long()  # (Nv, 3), >= 0
+    flat = coord[:, 2] * (gy * gx) + coord[:, 1] * gx + coord[:, 0]
+
+    order = torch.argsort(flat, stable=True)  # stable -> in-voxel original order
+    flat_s, pts_s, coord_s = flat[order], pts[order], coord[order]
+    uniq, inv, counts = torch.unique_consecutive(flat_s, return_inverse=True, return_counts=True)
+    num_voxels = int(uniq.shape[0])
+    if num_voxels > max_voxels:
+        return None  # caller falls back to the C++ op
+
+    seg_start = torch.zeros(num_voxels, dtype=torch.long, device=dev)
+    seg_start[1:] = torch.cumsum(counts, 0)[:-1]
+    rank = torch.arange(flat_s.shape[0], device=dev) - seg_start[inv]
+    keep = rank < max_num_points
+
+    voxels = torch.zeros(num_voxels, max_num_points, feat_dim, device=dev, dtype=pts.dtype)
+    voxels[inv[keep], rank[keep]] = pts_s[keep]
+    num_points_per_voxel = counts.clamp(max=max_num_points).to(torch.int32)
+    coors = coord_s[seg_start].to(torch.int32)  # (M, 3) = (x, y, z), matching the C++ op
+    return voxels, coors, num_points_per_voxel
+
+
 class Voxelization(nn.Module):
 
     def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000, deterministic=True):
@@ -126,6 +186,21 @@ def forward(self, input):
         else:
             max_voxels = self.max_voxels[1]
 
+        # Fast parallel path for the deterministic case: bit-identical to the
+        # C++ hard_voxelize(deterministic=True) but ~100x faster on GPU (the C++
+        # path is O(N^2) + single-thread). Falls back to the C++ op on CPU, in
+        # non-deterministic mode, or when M > max_voxels.
+        if self.deterministic and input.is_cuda:
+            out = voxelize_fast_gpu(
+                input,
+                self.voxel_size,
+                self.point_cloud_range,
+                self.max_num_points,
+                max_voxels,
+            )
+            if out is not None:
+                return out
+
         return voxelization(
             input,
             self.voxel_size,

From 3029147d36ad2d8b10fedba2b044078016c26415 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Jun 2026 07:10:00 +0000
Subject: [PATCH 2/5] ci(pre-commit): autofix

---
 .../bevfusion/ops/voxel/test_voxelize_fast.py | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
index dbc35c8a9..c3db41100 100644
--- a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
+++ b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
@@ -15,7 +15,6 @@
 import time
 
 import torch
-
 from bevfusion.ops.voxel.voxelize import voxelization, voxelize_fast_gpu
 
 
@@ -54,9 +53,11 @@ def check(name, points, voxel_size, pcr, max_points, max_voxels):
     torch.cuda.synchronize()
     tf = (time.perf_counter() - t0) / 30 * 1000
 
-    print(f"{name:8s}: M={cc.shape[0]:>7d} coord_eq={coord_eq} num_eq={num_eq} "
-          f"vfe_maxdiff={vfe_max:.1e} | cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x)  "
-          f"{'PASS' if ok else 'FAIL'}")
+    print(
+        f"{name:8s}: M={cc.shape[0]:>7d} coord_eq={coord_eq} num_eq={num_eq} "
+        f"vfe_maxdiff={vfe_max:.1e} | cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x)  "
+        f"{'PASS' if ok else 'FAIL'}"
+    )
     return ok
 
 
@@ -70,10 +71,14 @@ def make_cloud(num_voxels, max_pts_per, vs, pcr, dev):
     gz = round((pcr[5] - pcr[2]) / vs[2])
     vst = torch.tensor(vs, device=dev)
     rmin = torch.tensor(pcr[:3], device=dev)
-    cells = torch.stack([
-        torch.randint(0, gx, (num_voxels,), device=dev),
-        torch.randint(0, gy, (num_voxels,), device=dev),
-        torch.randint(0, gz, (num_voxels,), device=dev)], dim=1)
+    cells = torch.stack(
+        [
+            torch.randint(0, gx, (num_voxels,), device=dev),
+            torch.randint(0, gy, (num_voxels,), device=dev),
+            torch.randint(0, gz, (num_voxels,), device=dev),
+        ],
+        dim=1,
+    )
     counts = torch.randint(1, max_pts_per + 1, (num_voxels,), device=dev)
     rep = cells.repeat_interleave(counts, 0).float()
     jitter = torch.rand(rep.shape[0], 3, device=dev) * 0.999  # stay inside the cell

From 7fc9d22c99f39ffd72ba9be380c3cdcb8a0b08ec Mon Sep 17 00:00:00 2001
From: Max-Bin <vborisw@gmail.com>
Date: Mon, 1 Jun 2026 16:19:03 +0900
Subject: [PATCH 3/5] fix(voxelize): drop out-of-grid voxels to match the C++
 op exactly

Per review (P2): the fast path used a raw range filter (>= min & < max), but the
CUDA dynamic_voxelize_kernel computes the voxel index and drops points whose
index is out of the rounded grid (c < 0 or c >= grid). For non-integer
range/voxel_size, or points at the upper bound that floor up to c == grid, the
two diverged and the fast path could emit out-of-grid voxels the encoder would
reject. Now compute coord first and keep only 0 <= coord < grid, exactly like
the C++ kernel.

Verified bit-identical (coords / num_points / per-voxel mean) vs the C++ op on
non-integer-grid + upper-edge synthetic clouds and on real frames; fast output
is always within the grid.

Signed-off-by: Max-Bin <vborisw@gmail.com>
---
 projects/BEVFusion/bevfusion/ops/voxel/voxelize.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
index 8d911ae13..0adbb3f7d 100644
--- a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
+++ b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
@@ -102,16 +102,23 @@ def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max
     grid = torch.round((rmax - rmin) / vs).long()
     gx, gy = int(grid[0]), int(grid[1])
 
-    in_range = ((points[:, :3] >= rmin) & (points[:, :3] < rmax)).all(dim=1)
-    pts = points[in_range]
     feat_dim = points.shape[1]
+    # Match the CUDA dynamic_voxelize_kernel EXACTLY: compute the voxel index for
+    # every point, then drop any whose index is outside the rounded grid
+    # (c_* < 0 or c_* >= grid_*). A raw `>= min & < max` range filter is NOT
+    # equivalent when range/voxel_size isn't an integer, or when a point at the
+    # upper bound floors up to c_* == grid_* — the C++ op discards those, so the
+    # fast path must too (else it emits out-of-grid voxels the encoder rejects).
+    coord = torch.floor((points[:, :3] - rmin) / vs).long()  # (N, 3)
+    valid = ((coord >= 0) & (coord < grid)).all(dim=1)
+    pts = points[valid]
+    coord = coord[valid]
     if pts.shape[0] == 0:
         return (
             points.new_zeros((0, max_num_points, feat_dim)),
             points.new_zeros((0, 3), dtype=torch.int32),
             points.new_zeros((0,), dtype=torch.int32),
         )
-    coord = torch.floor((pts[:, :3] - rmin) / vs).long()  # (Nv, 3), >= 0
     flat = coord[:, 2] * (gy * gx) + coord[:, 1] * gx + coord[:, 0]
 
     order = torch.argsort(flat, stable=True)  # stable -> in-voxel original order

From b84a3e15fa70c5fa113a73aa79a2023b33de4353 Mon Sep 17 00:00:00 2001
From: Max-Bin <vborisw@gmail.com>
Date: Tue, 2 Jun 2026 10:14:28 +0900
Subject: [PATCH 4/5] perf(voxelize): drop per-forward .item() sync; strengthen
 equivalence test

Address review feedback on the fast GPU voxelizer:

- voxelize_fast_gpu: compute the grid dims on the host from the config
  instead of via a CUDA tensor, removing the per-forward device->host
  .item() sync on this hot path (Copilot). floor(x+0.5) mirrors the C++
  round() exactly; grid values verified identical to the old computation
  across configs, and the op stays bit-identical on 8 real frames
  (coords / num_points / full voxel tensors, max_abs_diff = 0).
- Comments: clarify that the z-major linear key is an internal sort/group
  key only and the output coords are (x, y, z) (matching this repo's
  compiled hard_voxelize), and why M > max_voxels falls back to the C++ op
  rather than truncating here (preserves bit-identity) (KSeangTan).
- test: convert to unittest (auto-skips without CUDA) and compare the full
  per-voxel tensors instead of only per-voxel means, so kept point-set and
  slot-order differences are caught, not just permutation-invariant means
  (Copilot).

Signed-off-by: Max-Bin <vborisw@gmail.com>
---
 .../bevfusion/ops/voxel/test_voxelize_fast.py | 121 +++++++++++-------
 .../BEVFusion/bevfusion/ops/voxel/voxelize.py |  32 ++++-
 2 files changed, 99 insertions(+), 54 deletions(-)

diff --git a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
index c3db41100..dfd1af59a 100644
--- a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
+++ b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
@@ -1,64 +1,53 @@
 """Verify voxelize_fast_gpu is bit-identical to the C++ hard_voxelize op.
 
-Run on a CUDA machine with the compiled voxel op:
+Run as a unittest (skips when no CUDA / compiled op is available):
+
+    cd projects/BEVFusion && python -m unittest bevfusion.ops.voxel.test_voxelize_fast
+
+or directly, which also prints the speed benchmark:
 
     cd projects/BEVFusion && python -m bevfusion.ops.voxel.test_voxelize_fast
 
-Checks, on synthetic clouds (incl. dense clusters that trigger the
-max_num_points truncation), that the fast path reproduces the C++
-deterministic op's coords / num_points / per-voxel mean (HardSimpleVFE input)
-bit-for-bit. The voxel ROW order differs (sort-order vs first-appearance) and is
-aligned away before comparison — it is irrelevant to the sparse encoder, which
-indexes voxels by coords.
+On synthetic clouds (incl. dense clusters that trigger the max_num_points
+truncation) it checks that the fast path reproduces the C++ deterministic op's
+coords / num_points / and the FULL per-voxel point tensors bit-for-bit. The
+voxel ROW order differs (sort-order vs first-appearance) and is aligned away by
+sorting both on the voxel id before comparison — it is irrelevant to the sparse
+encoder, which indexes voxels by coords.
 """
 
 import time
+import unittest
 
 import torch
+
 from bevfusion.ops.voxel.voxelize import voxelization, voxelize_fast_gpu
 
+VS = [0.17, 0.17, 0.2]
+PCR = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0]
+MNP, MAXV = 10, 120000
+
 
 def _flat(coors, gx, gy):
     return coors[:, 2].long() * gy * gx + coors[:, 1].long() * gx + coors[:, 0].long()
 
 
-def check(name, points, voxel_size, pcr, max_points, max_voxels):
+def compare(points, voxel_size, pcr, max_points, max_voxels):
+    """Return (coord_eq, num_eq, vox_max_abs_diff) of fast vs C++ op."""
     gx = round((pcr[3] - pcr[0]) / voxel_size[0])
     gy = round((pcr[4] - pcr[1]) / voxel_size[1])
     vc, cc, nc = voxelization(points, voxel_size, pcr, max_points, max_voxels, True)
     vf, cf, nf = voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels)
 
+    # Align the two voxel orderings (sort-order vs first-appearance) by voxel id.
     oc, of = torch.argsort(_flat(cc, gx, gy)), torch.argsort(_flat(cf, gx, gy))
     coord_eq = torch.equal(_flat(cc, gx, gy)[oc], _flat(cf, gx, gy)[of])
     num_eq = torch.equal(nc[oc], nf[of])
-    # per-voxel mean = HardSimpleVFE input; bit-identical iff kept point-sets match
-    mc = vc.sum(1) / nc.view(-1, 1).float()
-    mf = vf.sum(1) / nf.view(-1, 1).float()
-    vfe_max = (mc[oc] - mf[of]).abs().max().item() if coord_eq else float("nan")
-    ok = coord_eq and num_eq and vfe_max == 0.0
-
-    # timing: per-frame voxelize, C++ op vs fast (30 runs after warmup)
-    for _ in range(5):
-        voxelization(points, voxel_size, pcr, max_points, max_voxels, True)
-        voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels)
-    torch.cuda.synchronize()
-    t0 = time.perf_counter()
-    for _ in range(30):
-        voxelization(points, voxel_size, pcr, max_points, max_voxels, True)
-    torch.cuda.synchronize()
-    tc = (time.perf_counter() - t0) / 30 * 1000
-    t0 = time.perf_counter()
-    for _ in range(30):
-        voxelize_fast_gpu(points, voxel_size, pcr, max_points, max_voxels)
-    torch.cuda.synchronize()
-    tf = (time.perf_counter() - t0) / 30 * 1000
-
-    print(
-        f"{name:8s}: M={cc.shape[0]:>7d} coord_eq={coord_eq} num_eq={num_eq} "
-        f"vfe_maxdiff={vfe_max:.1e} | cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x)  "
-        f"{'PASS' if ok else 'FAIL'}"
-    )
-    return ok
+    # Stronger than per-voxel means (which are permutation-invariant): compare
+    # the FULL aligned voxel tensors. Both ops fill slots in original point
+    # order, so the kept point sets AND their slot order must match exactly.
+    vox_max = (vc[oc] - vf[of]).abs().max().item() if coord_eq else float("nan")
+    return coord_eq, num_eq, vox_max
 
 
 def make_cloud(num_voxels, max_pts_per, vs, pcr, dev):
@@ -86,20 +75,56 @@ def make_cloud(num_voxels, max_pts_per, vs, pcr, dev):
     return torch.cat([xyz, torch.rand(rep.shape[0], 2, device=dev)], dim=1)
 
 
-def main():
+@unittest.skipUnless(torch.cuda.is_available(), "needs a CUDA GPU + the compiled voxel op")
+class TestVoxelizeFast(unittest.TestCase):
+    """voxelize_fast_gpu must be bit-identical to the deterministic C++ op."""
+
+    def _check(self, num_voxels, max_pts_per):
+        torch.manual_seed(0)
+        pts = make_cloud(num_voxels, max_pts_per, VS, PCR, "cuda")
+        coord_eq, num_eq, vox_max = compare(pts, VS, PCR, MNP, MAXV)
+        self.assertTrue(coord_eq, "voxel coords differ from the C++ op")
+        self.assertTrue(num_eq, "num_points_per_voxel differs from the C++ op")
+        self.assertEqual(vox_max, 0.0, f"voxel features differ (max_abs_diff={vox_max})")
+
+    def test_sparse(self):
+        # ~90k voxels (< max_voxels), <= 3 pts/voxel: no max_num_points truncation.
+        self._check(90000, 3)
+
+    def test_dense(self):
+        # up to 15 pts/voxel exercises the max_num_points=10 truncation (which
+        # points are kept must match the C++ op exactly).
+        self._check(90000, 15)
+
+
+def _benchmark():
+    """Print equivalence + per-frame voxelize timing (C++ op vs fast path)."""
     assert torch.cuda.is_available(), "needs a GPU + the compiled voxel op"
     dev = "cuda"
-    vs = [0.17, 0.17, 0.2]
-    pcr = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0]
-    mnp, maxv = 10, 120000
-    torch.manual_seed(0)
-    # ~90k voxels (< max_voxels=120k, like a real 120m LiDAR frame); the dense
-    # case uses up to 15 points/voxel to exercise the max_num_points=10 truncation.
-    ok = True
-    ok &= check("sparse", make_cloud(90000, 3, vs, pcr, dev), vs, pcr, mnp, maxv)
-    ok &= check("dense", make_cloud(90000, 15, vs, pcr, dev), vs, pcr, mnp, maxv)
-    print("\nALL PASS" if ok else "\nFAILED")
+    for name, mpp in (("sparse", 3), ("dense", 15)):
+        torch.manual_seed(0)
+        pts = make_cloud(90000, mpp, VS, PCR, dev)
+        coord_eq, num_eq, vox_max = compare(pts, VS, PCR, MNP, MAXV)
+        for _ in range(5):
+            voxelization(pts, VS, PCR, MNP, MAXV, True)
+            voxelize_fast_gpu(pts, VS, PCR, MNP, MAXV)
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        for _ in range(30):
+            voxelization(pts, VS, PCR, MNP, MAXV, True)
+        torch.cuda.synchronize()
+        tc = (time.perf_counter() - t0) / 30 * 1000
+        t0 = time.perf_counter()
+        for _ in range(30):
+            voxelize_fast_gpu(pts, VS, PCR, MNP, MAXV)
+        torch.cuda.synchronize()
+        tf = (time.perf_counter() - t0) / 30 * 1000
+        ok = coord_eq and num_eq and vox_max == 0.0
+        print(
+            f"{name:8s}: coord_eq={coord_eq} num_eq={num_eq} vox_maxdiff={vox_max:.1e} | "
+            f"cpp={tc:.1f}ms fast={tf:.2f}ms ({tc / tf:.0f}x)  {'PASS' if ok else 'FAIL'}"
+        )
 
 
 if __name__ == "__main__":
-    main()
+    _benchmark()
diff --git a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
index 0adbb3f7d..fbee72768 100644
--- a/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
+++ b/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
@@ -1,4 +1,6 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+
 import torch
 from torch import nn
 from torch.autograd import Function
@@ -96,11 +98,16 @@ def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max
     replicate in that (rare) case.
     """
     dev = points.device
+    # Grid dims from the python config on the HOST — never a CUDA tensor — so
+    # there is no per-forward device->host `.item()` sync (this path is hot).
+    # `floor(x + 0.5)` mirrors the C++ op's `round()` exactly
+    # (voxelization_cuda.cu: `grid_* = round((max - min) / voxel_*)`).
+    gx = int(math.floor((point_cloud_range[3] - point_cloud_range[0]) / voxel_size[0] + 0.5))
+    gy = int(math.floor((point_cloud_range[4] - point_cloud_range[1]) / voxel_size[1] + 0.5))
+    gz = int(math.floor((point_cloud_range[5] - point_cloud_range[2]) / voxel_size[2] + 0.5))
     vs = torch.tensor(voxel_size, device=dev, dtype=points.dtype)
     rmin = torch.tensor(point_cloud_range[:3], device=dev, dtype=points.dtype)
-    rmax = torch.tensor(point_cloud_range[3:], device=dev, dtype=points.dtype)
-    grid = torch.round((rmax - rmin) / vs).long()
-    gx, gy = int(grid[0]), int(grid[1])
+    grid = torch.tensor([gx, gy, gz], device=dev)  # small constant, no .item() sync
 
     feat_dim = points.shape[1]
     # Match the CUDA dynamic_voxelize_kernel EXACTLY: compute the voxel index for
@@ -119,14 +126,23 @@ def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max
             points.new_zeros((0, 3), dtype=torch.int32),
             points.new_zeros((0,), dtype=torch.int32),
         )
+    # Internal grouping key only: linearize the (x, y, z) voxel index with z as
+    # the most-significant axis so a stable sort groups same-voxel points and
+    # preserves their original order. This z-major key does NOT change the
+    # output coord order (`coors` below is (x, y, z)).
     flat = coord[:, 2] * (gy * gx) + coord[:, 1] * gx + coord[:, 0]
 
     order = torch.argsort(flat, stable=True)  # stable -> in-voxel original order
     flat_s, pts_s, coord_s = flat[order], pts[order], coord[order]
     uniq, inv, counts = torch.unique_consecutive(flat_s, return_inverse=True, return_counts=True)
-    num_voxels = int(uniq.shape[0])
+    num_voxels = int(uniq.shape[0])  # tensor.shape is a host int — no device sync
     if num_voxels > max_voxels:
-        return None  # caller falls back to the C++ op
+        # Bail to the C++ op (don't just keep the first max_voxels here): the C++
+        # truncation when M > max_voxels drops voxels by FIRST-APPEARANCE order,
+        # which the sort-order fast path doesn't reproduce — keeping max_voxels
+        # here would retain a different voxel set and break bit-identity. This
+        # branch is rare in practice (M ~ 100k < max_voxels = 120k for a 120m sweep).
+        return None
 
     seg_start = torch.zeros(num_voxels, dtype=torch.long, device=dev)
     seg_start[1:] = torch.cumsum(counts, 0)[:-1]
@@ -136,7 +152,11 @@ def voxelize_fast_gpu(points, voxel_size, point_cloud_range, max_num_points, max
     voxels = torch.zeros(num_voxels, max_num_points, feat_dim, device=dev, dtype=pts.dtype)
     voxels[inv[keep], rank[keep]] = pts_s[keep]
     num_points_per_voxel = counts.clamp(max=max_num_points).to(torch.int32)
-    coors = coord_s[seg_start].to(torch.int32)  # (M, 3) = (x, y, z), matching the C++ op
+    # (x, y, z) order — matches this repo's compiled hard_voxelize, whose
+    # dynamic_voxelize_kernel writes coors_offset[0..2] = c_x, c_y, c_z
+    # (voxelization_cuda.cu). Any (z, y, x) flip for the sparse encoder happens
+    # downstream and applies equally to this and the C++ op's output.
+    coors = coord_s[seg_start].to(torch.int32)  # (M, 3) = (x, y, z)
     return voxels, coors, num_points_per_voxel
 
 

From d93cfe48f7d66e6b2385b98788aaa07af2ed4833 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 2 Jun 2026 01:15:26 +0000
Subject: [PATCH 5/5] ci(pre-commit): autofix

---
 projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
index dfd1af59a..fe4233701 100644
--- a/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
+++ b/projects/BEVFusion/bevfusion/ops/voxel/test_voxelize_fast.py
@@ -20,7 +20,6 @@
 import unittest
 
 import torch
-
 from bevfusion.ops.voxel.voxelize import voxelization, voxelize_fast_gpu
 
 VS = [0.17, 0.17, 0.2]