From 24ac20121299a15d624251987a2ed0bae7e92e89 Mon Sep 17 00:00:00 2001
From: Prathmesh Atkale <atkaleprathmesh@gmail.com>
Date: Mon, 23 Mar 2026 01:01:36 +0530
Subject: [PATCH 1/3] feat - added vendor agnostic sonar backend

Signed-off-by: Prathmesh Atkale <atkaleprathmesh@gmail.com>
---
 .../multibeam_sonar/CMakeLists.txt            |  50 ++-
 .../multibeam_sonar/sonar_compute_wgpu.cc     | 329 ++++++++++++++++
 .../multibeam_sonar/sonar_compute_wgpu.hh     | 135 +++++++
 .../multibeam_sonar/wgpu_backend/Cargo.toml   |  16 +
 .../multibeam_sonar/wgpu_backend/build.rs     |   9 +
 .../wgpu_backend/cbindgen.toml                |   3 +
 .../wgpu_backend/include/sonar_wgpu.h         |  76 ++++
 .../wgpu_backend/shaders/backscatter.wgsl     | 194 ++++++++++
 .../wgpu_backend/shaders/convert.wgsl         |  36 ++
 .../wgpu_backend/shaders/fft.wgsl             | 111 ++++++
 .../wgpu_backend/shaders/matmul.wgsl          |  78 ++++
 .../multibeam_sonar/wgpu_backend/src/lib.rs   | 154 ++++++++
 .../wgpu_backend/src/physics_engine.rs        | 353 ++++++++++++++++++
 13 files changed, 1535 insertions(+), 9 deletions(-)
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.hh 
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/Cargo.toml
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/build.rs
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/cbindgen.toml
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/include/sonar_wgpu.h
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/backscatter.wgsl
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/convert.wgsl
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/fft.wgsl
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/matmul.wgsl
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/lib.rs
 create mode 100644 gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/physics_engine.rs
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
index c5a1d2d0..4a46e008 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
@@ -1,23 +1,16 @@
 cmake_minimum_required(VERSION 3.11.0 FATAL_ERROR)
-
 # Suppress developer warnings for the entire workspace.
-
 set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS TRUE CACHE BOOL
     "Suppress developer warnings" FORCE)
 # Set CMP0144 to NEW to ensure find_package
 # uses upper-case <PACKAGENAME>_ROOT variables.
-
 cmake_policy(SET CMP0144 NEW)
-
 project(multibeam_sonar)
-
 set(CUDA_ARCHITECTURE "60" CACHE STRING "Target CUDA SM version")
-
 find_package(ament_cmake REQUIRED)
 find_package(CUDAToolkit QUIET)
 
 if(CUDAToolkit_FOUND)
-
   enable_language(CUDA)
   find_package(CUDA REQUIRED)
   message(STATUS "CUDA found, enabling CUDA support.")
@@ -47,17 +40,44 @@ if(CUDAToolkit_FOUND)
     set(GZ_RENDERING_TARGET gz-rendering${GZ_RENDERING_VER}-ogre)
     add_definitions(-DWITH_OGRE)
   endif()
-
   if(TARGET gz-rendering${GZ_RENDERING_VER}::ogre2)
     set(HAVE_OGRE2 TRUE)
     set(GZ_RENDERING_TARGET gz-rendering${GZ_RENDERING_VER}-ogre2)
     add_definitions(-DWITH_OGRE2)
   endif()
 
+  # Detected at configure time by presence of cargo.
+  # If cargo is absent the wgpu sources are simply excluded — CUDA build is
+  # completely unaffected.
+  find_program(CARGO_EXECUTABLE cargo DOC "Rust cargo package manager")
+  if(CARGO_EXECUTABLE)
+    message(STATUS "[multibeam_sonar] cargo found — wgpu backend enabled.")
+    set(WGPU_DIR "${CMAKE_CURRENT_SOURCE_DIR}/wgpu_backend")
+    set(WGPU_LIB "${WGPU_DIR}/target/release/libsonar_wgpu_backend.a")
+    set(WGPU_HDR "${WGPU_DIR}/include/sonar_wgpu.h")
+    add_custom_command(
+      OUTPUT  "${WGPU_LIB}" "${WGPU_HDR}"
+      COMMAND "${CARGO_EXECUTABLE}" build --release
+      WORKING_DIRECTORY "${WGPU_DIR}"
+      COMMENT "Building wgpu sonar backend (Rust/cargo)..."
+      VERBATIM
+    )
+    add_custom_target(sonar_wgpu_build DEPENDS "${WGPU_LIB}")
+    add_library(sonar_wgpu_static STATIC IMPORTED GLOBAL)
+    set_target_properties(sonar_wgpu_static PROPERTIES
+      IMPORTED_LOCATION "${WGPU_LIB}")
+    add_dependencies(sonar_wgpu_static sonar_wgpu_build)
+    set(HAVE_WGPU ON)
+  else()
+    message(STATUS "[multibeam_sonar] cargo not found — wgpu backend disabled, CPU fallback active.")
+    set(HAVE_WGPU OFF)
+  endif()
+
   add_library(${PROJECT_NAME}
     SHARED
       MultibeamSonarSensor.cc
       sonar_calculation_cuda.cu
+      sonar_compute_wgpu.cc        
   )
 
   set_target_properties(${PROJECT_NAME}
@@ -92,6 +112,18 @@ if(CUDAToolkit_FOUND)
     ${CUBLAS_LIB}
   )
 
+  # Link wgpu static lib if built 
+  if(HAVE_WGPU)
+    target_compile_definitions(${PROJECT_NAME} PRIVATE HAVE_WGPU_BACKEND)
+    target_include_directories(${PROJECT_NAME} PRIVATE "${WGPU_DIR}/include")
+    add_dependencies(${PROJECT_NAME} sonar_wgpu_build)
+    target_link_libraries(${PROJECT_NAME}
+      sonar_wgpu_static
+      dl        # vulkan loader
+      pthread   # wgpu async runtime
+    )
+  endif()
+
   install(TARGETS ${PROJECT_NAME}
     DESTINATION lib/${PROJECT_NAME}
   )
@@ -120,4 +152,4 @@ else()
                 "Skipping CUDA-specific targets")
 endif()
 
-ament_package()
+ament_package()
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
new file mode 100644
index 00000000..a3cf7915
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
@@ -0,0 +1,329 @@
+#include <memory>     // std::unique_ptr
+#include <chrono>     // std::chrono
+#include <complex>    // std::complex
+#include <cmath>      // std::cos, sin, sqrt, exp
+#include <algorithm>  // std::max
+#include <vector>     // std::vector
+#include <iostream>   // std::cout, cerr
+#include <cstdint>    // uint32_t, uint64_t
+#include <opencv2/core.hpp>
+#define _USE_MATH_DEFINES
+#include <cmath>
+using namespace std;
+// C ABI bridge (Rust <-> C++)
+// These are generated by cbindgen from the Rust backend.
+#ifdef HAVE_WGPU_BACKEND
+extern "C"
+{
+  void * sonar_wgpu_create(
+    uint32_t n_beams, uint32_t n_rays, uint32_t n_freq,
+    float sound_speed, float bandwidth,
+    float max_range, float attenuation,
+    float source_level, float sensor_gain,
+    float h_fov, float v_fov,
+    uint64_t seed);
+
+  float * sonar_wgpu_compute(
+    void * engine,
+    const float * depth,
+    const float * normals,
+    const float * refl,
+    const float * beam_corr,
+    uint32_t n_beams, uint32_t n_rays, uint32_t n_freq,
+    uint64_t frame,
+    float beam_corr_sum);
+
+  void sonar_wgpu_free(float * ptr, size_t len);
+  void sonar_wgpu_destroy(void * engine);
+}   
+#endif
+
+namespace gz
+{
+namespace sensors
+{
+
+
+// CPU backend (reference path, always available)
+// Implements Eq.14 (amplitude) + Eq.8 (phase) without speckle. 
+class CpuComputeBackend : public SonarComputeBackend
+{
+public:
+  const char * Name() const override { return "cpu"; }
+
+  bool Initialize(const SonarComputeInput &) override { return true; }
+
+  bool Compute(const SonarComputeInput & input,
+               SonarComputeOutput & output) override
+  {
+    if (!input.depthImage || !input.normalImage || !input.reflectivityImage)
+      return false;
+
+    const int nb = input.nBeams;
+    const int nr = input.nRays;
+    const int nf = input.nFreq;
+
+    output.nBeams = nb;
+    output.nFreq  = nf;
+    output.beamSpectrum.assign(
+      static_cast<size_t>(nb) * nf, std::complex<float>(0.f, 0.f));
+
+    const float c  = static_cast<float>(input.soundSpeed);
+    const float B  = input.bandwidth;
+    const float df = B / static_cast<float>(nf);
+    const float pi = static_cast<float>(M_PI);
+
+    auto t0 = std::chrono::high_resolution_clock::now();
+
+    for (int b = 0; b < nb; ++b)
+    {
+      for (int ray = 0; ray < nr; ++ray)
+      {
+        if (ray >= input.depthImage->rows || b >= input.depthImage->cols)
+          continue;
+
+        const float r = input.depthImage->at<float>(ray, b);
+        if (r <= 0.f || r > static_cast<float>(input.maxDistance))
+          continue;
+
+        const float mu = input.reflectivityImage->at<float>(ray, b);
+        const cv::Vec3f n = input.normalImage->at<cv::Vec3f>(ray, b);
+
+        // cos(α) approximation from surface normal
+        const float cos_inc = std::abs(n[2]);
+
+        // projected footprint: r^2 * dθ * dφ
+        const float dth  = input.hFOV / std::max(nb - 1, 1);
+        const float dphi = input.vFOV / std::max(nr - 1, 1);
+        const float dA   = r * r * dth * dphi;
+
+        // transmission loss (spreading + absorption)
+        const float TL = std::exp(-static_cast<float>(input.attenuation) * r) / r;
+
+        // Eq.14 amplitude
+        const float A = std::sqrt(mu) * cos_inc * std::sqrt(dA) * TL;
+
+        for (int f = 0; f < nf; ++f)
+        {
+          float freq;
+          if (nf % 2 == 0)
+            freq = df * (-static_cast<float>(nf) +
+                         2.f * (static_cast<float>(f) + 1.f)) / 2.f;
+          else
+            freq = df * (-(static_cast<float>(nf) - 1.f) +
+                          2.f * (static_cast<float>(f) + 1.f)) / 2.f;
+
+          const float k   = 2.f * pi * freq / c;
+          const float phi = 2.f * r * k;   // two-way path
+
+          output.At(b, f) +=
+            std::complex<float>(A * std::cos(phi), A * std::sin(phi));
+        }
+      }
+    }
+
+    auto t1 = std::chrono::high_resolution_clock::now();
+    output.computeMicros =
+      std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+
+    return true;
+  }
+};
+
+
+// Backend factory 
+std::unique_ptr<SonarComputeBackend>
+CreateComputeBackend(const std::string & name)
+{
+  if (name == "wgpu")
+    return std::make_unique<WgpuComputeBackend>();
+
+  // CUDA backend handled elsewhere
+  return std::make_unique<CpuComputeBackend>();
+}
+
+// wgpu backend
+const char * WgpuComputeBackend::Name() const { return "wgpu"; }
+
+// Probe GPU once during init
+bool WgpuComputeBackend::Initialize(const SonarComputeInput &)
+{
+#ifndef HAVE_WGPU_BACKEND
+  std::cerr << "[sonar_wgpu] not compiled, using CPU.\n";
+  gpuAvailable_ = false;
+  initialized_  = true;
+  return true;
+#else
+  void * probe = sonar_wgpu_create(
+    1u, 1u, 4u,
+    1500.f, 2950.f,
+    10.f, 0.f,
+    220.f, 1.f,
+    1.5708f, 0.3491f,
+    1u);
+
+  if (probe)
+  {
+    sonar_wgpu_destroy(probe);
+    gpuAvailable_ = true;
+    std::cout << "[sonar_wgpu] GPU backend ready.\n";
+  }
+  else
+  {
+    gpuAvailable_ = false;
+    std::cerr << "[sonar_wgpu] GPU unavailable, falling back to CPU.\n";
+  }
+
+  initialized_ = true;
+  return true;
+#endif
+}
+
+
+// Compute 
+bool WgpuComputeBackend::Compute(
+  const SonarComputeInput & input,
+  SonarComputeOutput & output)
+{
+  if (!initialized_)
+    return false;
+
+  static void * engine = nullptr;
+  static int e_nb = 0, e_nr = 0, e_nf = 0;
+
+  // fallback path
+  if (!engine)
+  {
+    gpuAvailable_ = false;
+
+    if (!cpuFallback_){
+        cpuFallback_ = std::make_unique<CpuComputeBackend>();
+        cpuFallback_->Initialize(input);
+    }
+
+    return cpuFallback_->Compute(input, output);
+  }
+
+#ifndef HAVE_WGPU_BACKEND
+  return false;
+#else
+
+  if (!input.depthImage || !input.normalImage || !input.reflectivityImage)
+    return false;
+
+  const int nb = input.nBeams;
+  const int nr = input.nRays;
+  const int nf = input.nFreq;
+
+  auto t0 = std::chrono::high_resolution_clock::now();
+
+  // flatten OpenCV layout → contiguous [beam * nr + ray]
+  std::vector<float> depthFlat(nb * nr, 0.f);
+  std::vector<float> reflFlat (nb * nr, 0.f);
+  std::vector<float> normalFlat(nb * nr * 3, 0.f);
+
+  for (int b = 0; b < nb; ++b)
+  {
+    for (int r = 0; r < nr; ++r)
+    {
+      if (r >= input.depthImage->rows || b >= input.depthImage->cols)
+        continue;
+
+      const size_t idx = static_cast<size_t>(b) * nr + r;
+
+      depthFlat[idx] = input.depthImage->at<float>(r, b);
+      reflFlat[idx]  = input.reflectivityImage->at<float>(r, b);
+
+      const cv::Vec3f n = input.normalImage->at<cv::Vec3f>(r, b);
+      normalFlat[idx*3+0] = n[0];
+      normalFlat[idx*3+1] = n[1];
+      normalFlat[idx*3+2] = n[2];
+    }
+  }
+
+  // beam correction matrix (identity if missing)
+  std::vector<float> beamCorrFlat(nb * nb, 0.f);
+  if (input.beamCorrector)
+  {
+    for (int r = 0; r < nb; ++r)
+      for (int c = 0; c < nb; ++c)
+        beamCorrFlat[r * nb + c] = input.beamCorrector[r][c];
+  }
+  else
+  {
+    for (int i = 0; i < nb; ++i)
+      beamCorrFlat[i * nb + i] = 1.f;
+  }
+
+  // persistent engine (recreated on dimension change)
+  static void * engine = nullptr;
+  static int    e_nb = 0, e_nr = 0, e_nf = 0;
+
+  if (!engine || e_nb != nb || e_nr != nr || e_nf != nf)
+  {
+    if (engine)
+      sonar_wgpu_destroy(engine);
+
+    engine = sonar_wgpu_create(
+      nb, nr, nf,
+      input.soundSpeed,
+      input.bandwidth,
+      input.maxDistance,
+      input.attenuation,
+      input.sourceLevel,
+      input.sensorGain,
+      input.hFOV,
+      input.vFOV,
+      input.seed);
+
+    e_nb = nb; e_nr = nr; e_nf = nf;
+
+    if (!engine)
+    {
+      gpuAvailable_ = false;
+      return Compute(input, output); // retry via CPU
+    }
+  }
+
+  float * result = sonar_wgpu_compute(
+    engine,
+    depthFlat.data(),
+    normalFlat.data(),
+    reflFlat.data(),
+    beamCorrFlat.data(),
+    nb, nr, nf,
+    input.frameIndex,
+    input.beamCorrectorSum > 0.f ?
+      input.beamCorrectorSum : static_cast<float>(nb));
+
+  if (!result)
+  {
+    return Compute(input, output); // transient fallback
+  }
+
+  // unpack [re, im]
+  output.nBeams = nb;
+  output.nFreq  = nf;
+  output.beamSpectrum.resize(nb * nf);
+
+  for (int b = 0; b < nb; ++b)
+  {
+    for (int f = 0; f < nf; ++f)
+    {
+      const size_t i = (static_cast<size_t>(b) * nf + f) * 2;
+      output.At(b, f) = { result[i], result[i + 1] };
+    }
+  }
+
+  sonar_wgpu_free(result, nb * nf * 2);
+
+  auto t1 = std::chrono::high_resolution_clock::now();
+  output.computeMicros =
+    std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+
+  return true;
+#endif
+}
+
+} // namespace sensors
+} // namespace gz
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.hh  b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.hh 
new file mode 100644
index 00000000..8e40f6ad
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.hh 	
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2025 Open Source Robotics Foundation
+ *
+ * Licensed under the Apache License, Version 2.0
+ */
+
+#ifndef GZ_SENSORS_SONAR_COMPUTE_WGPU_HH_
+#define GZ_SENSORS_SONAR_COMPUTE_WGPU_HH_
+
+#include <complex>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+namespace gz
+{
+namespace sensors
+{
+
+// Per-frame data passed into the sonar pipeline
+// (directly mirrors Gazebo depth + normal render outputs)
+struct SonarComputeInput
+{
+  // Rendered images (row = ray, col = beam)
+  const cv::Mat * depthImage        = nullptr;   // distance [m]
+  const cv::Mat * normalImage       = nullptr;   // surface normals
+  const cv::Mat * reflectivityImage = nullptr;   // μ ∈ [0,1]
+
+  // Beam correction matrix W (applied after scattering)
+  float ** beamCorrector    = nullptr;
+  float    beamCorrectorSum = 1.0f;
+
+  // Optional windowing (used before IFFT if needed)
+  float * window = nullptr;
+
+  // Dimensions
+  int nBeams = 0;
+  int nRays  = 0;
+  int nFreq  = 0;
+  int raySkips = 0;
+
+  // Acoustic parameters (from sonar model)
+  double soundSpeed  = 1500.0;
+  double maxDistance = 60.0;
+  double sourceLevel = 220.0;
+  double attenuation = 0.0;
+  double bandwidth   = 2950.0;
+
+  // Sensor geometry
+  float sensorGain = 1.0f;
+  float hFOV = 1.5708f;   // 90°
+  float vFOV = 0.3491f;   // 20°
+
+  // Frame-based seed (used for speckle / randomness on GPU)
+  uint64_t frameIndex = 0;
+  uint64_t seed       = 42;
+};
+
+// Output: complex spectrum per beam
+// (frequency-domain before optional IFFT → range profile)
+struct SonarComputeOutput
+{
+  int nBeams = 0;
+  int nFreq  = 0;
+
+  // Layout: [beam * nFreq + freq]
+  std::vector<std::complex<float>> beamSpectrum;
+
+  double computeMicros = 0.0;
+
+  inline std::complex<float> & At(int b, int f)
+  {
+    return beamSpectrum[static_cast<size_t>(b) * nFreq + f];
+  }
+};
+
+// Backend interface
+// Allows runtime switching: CPU / CUDA / WGPU
+class SonarComputeBackend
+{
+public:
+  virtual ~SonarComputeBackend() = default;
+
+  // "wgpu", "cuda", "cpu"
+  virtual const char * Name() const = 0;
+
+  // Allocate GPU / buffers etc.
+  virtual bool Initialize(const SonarComputeInput & input) = 0;
+
+  // Run one frame
+  virtual bool Compute(
+    const SonarComputeInput & input,
+    SonarComputeOutput & output) = 0;
+};
+
+// Factory (used by sensor plugin)
+std::unique_ptr<SonarComputeBackend>
+CreateComputeBackend(const std::string & name);
+
+// WGPU backend (Rust + WGSL)
+// Pipeline follows Choi et al. sonar model:
+//
+//   1. Backscatter     → amplitude + phase (Eq.14, Eq.8)
+//   2. Beam correction → W·P
+//   3. FFT             → frequency → time (range)
+//
+// Falls back to CPU if GPU is unavailable or fails at runtime.
+class WgpuComputeBackend : public SonarComputeBackend
+{
+public:
+  WgpuComputeBackend()  = default;
+  ~WgpuComputeBackend() = default;
+
+  const char * Name() const override;
+
+  bool Initialize(const SonarComputeInput & input) override;
+
+  bool Compute(
+    const SonarComputeInput & input,
+    SonarComputeOutput & output) override;
+
+private:
+  bool initialized_  = false;
+  bool gpuAvailable_ = false;
+
+  // Used when GPU fails mid-run (device lost, bad config, etc.)
+  std::unique_ptr<SonarComputeBackend> cpuFallback_;
+};
+
+}  // namespace sensors
+}  // namespace gz
+
+#endif
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/Cargo.toml b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/Cargo.toml
new file mode 100644
index 00000000..d8f6c1e8
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name    = "sonar_wgpu_backend"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+wgpu     = "0.20"
+bytemuck = { version = "1.14", features = ["derive"] }
+pollster = "0.3"
+rustfft  = "6.2"
+
+[build-dependencies]
+cbindgen = "0.26"
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/build.rs b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/build.rs
new file mode 100644
index 00000000..8bfbc606
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/build.rs
@@ -0,0 +1,9 @@
+fn main() {
+    let crate_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap();
+    cbindgen::Builder::new()
+        .with_crate(&crate_dir)
+        .with_config(cbindgen::Config::from_file("cbindgen.toml").unwrap())
+        .generate()
+        .expect("cbindgen failed")
+        .write_to_file("include/sonar_wgpu.h");
+}
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/cbindgen.toml b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/cbindgen.toml
new file mode 100644
index 00000000..763c4d2f
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/cbindgen.toml
@@ -0,0 +1,3 @@
+language = "C"
+include_guard = "SONAR_WGPU_H"
+autogen_warning = "/* Auto-generated by cbindgen. Do not edit. */"
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/include/sonar_wgpu.h b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/include/sonar_wgpu.h
new file mode 100644
index 00000000..61e72186
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/include/sonar_wgpu.h
@@ -0,0 +1,76 @@
+#ifndef SONAR_WGPU_H
+#define SONAR_WGPU_H
+
+/* Generated via cbindgen — interface between C++ plugin and Rust backend */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque handle to Rust-side engine */
+typedef struct SonarPhysicsEngine SonarPhysicsEngine;
+
+
+/* Create GPU sonar engine.
+   Returns NULL if GPU init fails (caller should fallback to CPU). */
+SonarPhysicsEngine *sonar_wgpu_create(uint32_t n_beams,
+                                      uint32_t n_rays,
+                                      uint32_t n_freq,
+                                      float sound_speed,
+                                      float bandwidth,
+                                      float max_range,
+                                      float attenuation,
+                                      float source_level,
+                                      float sensor_gain,
+                                      float h_fov,
+                                      float v_fov,
+                                      uint64_t seed);
+
+
+/* Run one frame.
+   Output layout: [re0, im0, re1, im1, ...] (beam-major).
+   Size = n_beams * n_freq * 2.
+   Caller owns memory → free with sonar_wgpu_free(). */
+float *sonar_wgpu_compute(SonarPhysicsEngine *engine,
+                          const float *depth,
+                          const float *normals,
+                          const float *refl,
+                          const float *beam_corr,
+                          uint32_t n_beams,
+                          uint32_t n_rays,
+                          uint32_t n_freq,
+                          uint64_t frame,
+                          float beam_corr_sum);
+
+
+/* Free buffer returned by compute */
+void sonar_wgpu_free(float *ptr, uintptr_t len);
+
+
+/* Destroy engine + release GPU resources */
+void sonar_wgpu_destroy(void *engine);
+
+
+/* --- CPU / legacy path (kept for compatibility) --- */
+
+SonarPhysicsEngine *sonar_physics_create(uint32_t n_beams,
+                                         uint32_t n_rays,
+                                         uint32_t n_freq,
+                                         float sound_speed,
+                                         float bandwidth,
+                                         float max_range,
+                                         float attenuation,
+                                         float h_fov,
+                                         float v_fov);
+
+void sonar_physics_destroy(SonarPhysicsEngine *engine);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SONAR_WGPU_H */
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/backscatter.wgsl b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/backscatter.wgsl
new file mode 100644
index 00000000..54bae29a
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/backscatter.wgsl
@@ -0,0 +1,194 @@
+// backscatter.wgsl
+// Based on Choi et al. (2021)
+// Uses Eq.14 for per-ray amplitude and Eq.8 for frequency accumulation
+
+struct Params {
+    n_beams: u32,
+    n_rays: u32,
+    n_freq: u32,
+    _pad0: u32,
+
+    sound_speed: f32,
+    bandwidth: f32,
+    max_range: f32,
+    attenuation: f32,
+
+    h_fov: f32,
+    v_fov: f32,
+    mu_default: f32,
+    _pad1: f32,
+
+    seed: u32,
+    frame: u32,
+    _pad2: u32,
+    _pad3: u32,
+};
+
+@group(0) @binding(0) var<uniform> params: Params;
+@group(0) @binding(1) var<storage, read> depth: array<f32>;     // per-ray distance
+@group(0) @binding(2) var<storage, read> normals: array<f32>;   // packed xyz normals
+@group(0) @binding(3) var<storage, read> refl: array<f32>;      // reflectivity per ray
+@group(0) @binding(4) var<storage, read_write> out_re: array<atomic<i32>>;
+@group(0) @binding(5) var<storage, read_write> out_im: array<atomic<i32>>;
+
+const PI: f32 = 3.141592653589793;
+const SCALE: f32 = 1048576.0; // fixed-point scale for atomics
+
+//  RNG (Philox-style counter RNG, deterministic per (frame, beam, ray)) ---
+fn mulhilo32(a: u32, b: u32) -> vec2<u32> {
+    // manual 32-bit multiply split (WGSL doesn't expose 64-bit directly)
+    let a_lo = a & 0xFFFFu;
+    let a_hi = a >> 16u;
+    let b_lo = b & 0xFFFFu;
+    let b_hi = b >> 16u;
+
+    let p0 = a_lo * b_lo;
+    let p1 = a_hi * b_lo;
+    let p2 = a_lo * b_hi;
+    let p3 = a_hi * b_hi;
+
+    let mid = (p0 >> 16u) + (p1 & 0xFFFFu) + (p2 & 0xFFFFu);
+    let hi = p3 + (p1 >> 16u) + (p2 >> 16u) + (mid >> 16u);
+
+    return vec2<u32>(hi, a * b);
+}
+
+fn philox_round(c: vec4<u32>, k: vec2<u32>) -> vec4<u32> {
+    // one Feistel round
+    let r0 = mulhilo32(0xD2511F53u, c.x);
+    let r1 = mulhilo32(0xCD9E8D57u, c.z);
+    return vec4<u32>(r1.x ^ c.y ^ k.x, r1.y, r0.x ^ c.w ^ k.y, r0.y);
+}
+
+fn philox4x32_10(ctr: vec4<u32>, key: vec2<u32>) -> vec4<u32> {
+    // 10 rounds → good statistical quality, still fast on GPU
+    var c = ctr;
+    var k = key;
+
+    for (var i = 0; i < 9; i++) {
+        c = philox_round(c, k);
+        k.x += 0x9E3779B9u; // Weyl sequence increment
+        k.y += 0xBB67AE85u;
+    }
+    c = philox_round(c, k);
+    return c;
+}
+
+fn u32_to_unit(x: u32) -> f32 {
+    // map to (0,1] to avoid log(0)
+    return f32(x) * (1.0 / 4294967296.0) + (0.5 / 4294967296.0);
+}
+
+fn philox_normal4(seed: u32, frame: u32, subseq: u32) -> vec4<f32> {
+    // Box-Muller: convert uniform → Gaussian (needed for speckle)
+    let raw = philox4x32_10(
+        vec4<u32>(frame, 0u, subseq, 0u),
+        vec2<u32>(seed, 0u)
+    );
+
+    let u1 = u32_to_unit(raw.x);
+    let u2 = u32_to_unit(raw.y);
+    let u3 = u32_to_unit(raw.z);
+    let u4 = u32_to_unit(raw.w);
+
+    let r1 = sqrt(-2.0 * log(u1));
+    let r2 = sqrt(-2.0 * log(u3));
+
+    return vec4<f32>(
+        r1 * sin(2.0 * PI * u2),
+        r1 * cos(2.0 * PI * u2),
+        r2 * sin(2.0 * PI * u4),
+        r2 * cos(2.0 * PI * u4)
+    );
+}
+
+@compute @workgroup_size(8, 8, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+
+    let beam = gid.x;
+    let ray  = gid.y;
+
+    // guard against over-dispatch
+    if (beam >= params.n_beams || ray >= params.n_rays) {
+        return;
+    }
+
+    let idx = beam * params.n_rays + ray;
+    let r = depth[idx];
+
+    // skip invalid or out-of-range rays early (cheap reject)
+    if (r <= 0.001 || r > params.max_range) {
+        return;
+    }
+
+    // fetch surface info
+    let nx = normals[idx * 3u + 0u];
+    let ny = normals[idx * 3u + 1u];
+    let nz = normals[idx * 3u + 2u];
+    let mu = max(refl[idx], 0.0); // clamp just in case input is noisy
+
+    // convert pixel position → direction (sensor frame)
+    let beam_ang = params.h_fov * (f32(beam) / max(f32(params.n_beams) - 1.0, 1.0) - 0.5);
+    let ray_ang  = params.v_fov * (f32(ray)  / max(f32(params.n_rays)  - 1.0, 1.0) - 0.5);
+
+    let rd_x = sin(beam_ang) * cos(ray_ang);
+    let rd_y = sin(ray_ang);
+    let rd_z = cos(beam_ang) * cos(ray_ang);
+
+    // incidence term |dot(ray, normal)| → energy drop at grazing angles
+    let cos_inc = abs(rd_x * nx + rd_y * ny + rd_z * nz);
+
+    // differential area (solid angle scaled by range²)
+    let d_theta_h = params.h_fov / max(f32(params.n_beams) - 1.0, 1.0);
+    let d_theta_v = params.v_fov / max(f32(params.n_rays)  - 1.0, 1.0);
+    let dA = r * r * d_theta_h * d_theta_v;
+
+    // attenuation + spreading (simple model)
+    let TL = exp(-params.attenuation * r) / r;
+
+    // Eq.14 deterministic amplitude term
+    let A_det = sqrt(mu) * cos_inc * sqrt(dA) * TL;
+
+    // stochastic component → produces speckle when summed
+    let subseq = beam * params.n_rays + ray;
+    let xi = philox_normal4(params.seed, params.frame, subseq);
+
+    let amp_re = A_det * (xi.x / sqrt(2.0));
+    let amp_im = A_det * (xi.y / sqrt(2.0));
+
+    let delta_f  = params.bandwidth / f32(params.n_freq);
+    let n_freq_f = f32(params.n_freq);
+    let is_even  = (params.n_freq & 1u) == 0u;
+
+    for (var f = 0u; f < params.n_freq; f++) {
+
+        // match fft-style frequency layout (centered around 0)
+        var freq: f32;
+        if (is_even) {
+            freq = delta_f * (-n_freq_f + 2.0 * (f32(f) + 1.0)) / 2.0;
+        } else {
+            freq = delta_f * (-(n_freq_f - 1.0) + 2.0 * (f32(f) + 1.0)) / 2.0;
+        }
+
+        // Eq.8 phase term (two-way travel)
+        let k   = 2.0 * PI * freq / params.sound_speed;
+        let phi = 2.0 * r * k;
+
+        let c = cos(phi);
+        let s = sin(phi);
+
+        // complex multiply (manual since no complex type)
+        let contrib_re = amp_re * c - amp_im * s;
+        let contrib_im = amp_re * s + amp_im * c;
+
+        let out_idx = beam * params.n_freq + f;
+
+        // clamp before converting → avoid overflow in atomics
+        let re_i32 = i32(clamp(contrib_re * SCALE, -2147483520.0, 2147483520.0));
+        let im_i32 = i32(clamp(contrib_im * SCALE, -2147483520.0, 2147483520.0));
+
+        // accumulate across rays (parallel safe)
+        atomicAdd(&out_re[out_idx], re_i32);
+        atomicAdd(&out_im[out_idx], im_i32);
+    }
+}
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/convert.wgsl b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/convert.wgsl
new file mode 100644
index 00000000..a059aaca
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/convert.wgsl
@@ -0,0 +1,36 @@
+// convert.wgsl
+// takes the accumulated i32 buffers from scatter pass
+// converts them back to float so next stages can use them
+
+struct Params {
+    n_elements: u32,   // total entries (beams * freq bins)
+    scale:      f32,   // same scale factor used during atomic adds
+    _pad0:      u32,   // alignment padding (required for uniform layout)
+    _pad1:      u32,
+};
+
+@group(0) @binding(0) var<uniform> params: Params;
+
+// input buffers (fixed-point values)
+@group(0) @binding(1) var<storage, read> in_re: array<i32>;
+@group(0) @binding(2) var<storage, read> in_im: array<i32>;
+
+// output buffers (converted back to float)
+@group(0) @binding(3) var<storage, read_write> out_re: array<f32>;
+@group(0) @binding(4) var<storage, read_write> out_im: array<f32>;
+
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let i = gid.x;
+
+    // guard against over-dispatch
+    if (i >= params.n_elements) {
+        return;
+    }
+
+    // undo fixed-point scaling
+    let s = params.scale;
+
+    out_re[i] = f32(in_re[i]) / s;
+    out_im[i] = f32(in_im[i]) / s;
+}
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/fft.wgsl b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/fft.wgsl
new file mode 100644
index 00000000..48bf9a20
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/fft.wgsl
@@ -0,0 +1,111 @@
+// fft.wgsl
+// Spectrum → range conversion using radix-2 FFT
+// Based on beam spectrum from backscatter pass (Eq.8 → time domain)
+
+struct Params {
+    n_beams: u32,
+    n_freq:  u32,
+    log2_n:  u32,   // precomputed log2(n_freq)
+    _pad:    u32,
+};
+
+@group(0) @binding(0) var<uniform> params: Params;
+@group(0) @binding(1) var<storage, read_write> p_re: array<f32>;
+@group(0) @binding(2) var<storage, read_write> p_im: array<f32>;
+
+// shared memory for one beam
+var<workgroup> smem_re: array<f32, 4096>;
+var<workgroup> smem_im: array<f32, 4096>;
+
+const PI: f32 = 3.141592653589793;
+
+// reverse lowest 'bits' bits (needed for FFT input reordering)
+fn bit_reverse(v: u32, bits: u32) -> u32 {
+    var x = v;
+    var y = 0u;
+
+    for (var i = 0u; i < bits; i++) {
+        y = (y << 1u) | (x & 1u);
+        x >>= 1u;
+    }
+    return y;
+}
+
+// one workgroup = one beam
+@compute @workgroup_size(256, 1, 1)
+fn main(
+    @builtin(workgroup_id) wg: vec3<u32>,
+    @builtin(local_invocation_id) lid: vec3<u32>
+) {
+    let beam = wg.x;
+
+    // simple guard (also enforces shared memory limit)
+    if (beam >= params.n_beams || params.n_freq > 4096u) {
+        return;
+    }
+
+    let n = params.n_freq;
+    let base = beam * n;
+
+    //  Step 1: load + bit-reversal 
+    // FFT expects bit-reversed ordering before butterfly stages
+    // each thread loads multiple elements (stride = workgroup size)
+    for (var i = lid.x; i < n; i += 256u) {
+        let j = bit_reverse(i, params.log2_n);
+
+        // write directly into shared memory in reordered form
+        smem_re[j] = p_re[base + i];
+        smem_im[j] = p_im[base + i];
+    }
+    workgroupBarrier();
+
+    //  Step 2: butterfly stages 
+    // log2(n) stages, each doubling the merge size
+    for (var s = 0u; s < params.log2_n; s++) {
+
+        let half   = 1u << s;        // size of sub-FFT
+        let stride = half << 1u;     // full butterfly width
+
+        // total butterflies per stage = n/2
+        for (var i = lid.x; i < n / 2u; i += 256u) {
+
+            // map flat index → butterfly indices
+            let group = i / half;    // which block
+            let j     = i % half;    // position inside block
+
+            let i0 = group * stride + j;   // top element
+            let i1 = i0 + half;            // bottom element
+
+            // twiddle factor: exp(-i * 2π * j / stride)
+            // negative sign → forward FFT convention
+            let angle = -2.0 * PI * f32(j) / f32(stride);
+            let wr = cos(angle);
+            let wi = sin(angle);
+
+            // t = W * x[i1]
+            let tr = smem_re[i1] * wr - smem_im[i1] * wi;
+            let ti = smem_re[i1] * wi + smem_im[i1] * wr;
+
+            // butterfly combine
+            let ur = smem_re[i0];
+            let ui = smem_im[i0];
+
+            smem_re[i0] = ur + tr;
+            smem_im[i0] = ui + ti;
+
+            smem_re[i1] = ur - tr;
+            smem_im[i1] = ui - ti;
+        }
+
+        // sync before next stage (data dependency)
+        workgroupBarrier();
+    }
+
+    //  Step 3: write back 
+    // output is time-domain signal per beam
+    // range mapping is handled outside (t → r conversion)
+    for (var i = lid.x; i < n; i += 256u) {
+        p_re[base + i] = smem_re[i];
+        p_im[base + i] = smem_im[i];
+    }
+}
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/matmul.wgsl b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/matmul.wgsl
new file mode 100644
index 00000000..09eda690
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/matmul.wgsl
@@ -0,0 +1,78 @@
+// matmul.wgsl
+// Beam correction pass (Choi et al. 2021)
+//
+// Applies beam pattern matrix W to redistribute energy across beams.
+// Backscatter pass assumes D(θ,φ)=1, this step corrects that using W.
+//
+// C[beam, f] = Σ_k W[beam, k] * P_raw[k, f] / sum(W)
+
+struct Params {
+    n_beams: u32,
+    n_freq: u32,
+    beam_corrector_sum: f32, // used to keep total energy consistent
+    _pad: u32,
+};
+
+@group(0) @binding(0) var<uniform> params: Params;
+@group(0) @binding(1) var<storage, read> A: array<f32>; // [n_beams x n_beams]
+@group(0) @binding(2) var<storage, read> B: array<f32>; // [n_beams x n_freq]
+@group(0) @binding(3) var<storage, read_write> C: array<f32>; // [n_beams x n_freq]
+
+// shared tiles (16x16)
+var<workgroup> tile_A: array<array<f32, 16>, 16>;
+var<workgroup> tile_B: array<array<f32, 16>, 16>;
+
+@compute @workgroup_size(16, 16, 1)
+fn main(
+    @builtin(global_invocation_id) gid: vec3<u32>,
+    @builtin(local_invocation_id) lid: vec3<u32>
+) {
+    let row = gid.y; // beam index
+    let col = gid.x; // frequency index
+
+    // avoid out-of-bounds work when grid is padded
+    let in_bounds = (row < params.n_beams) && (col < params.n_freq);
+
+    var acc: f32 = 0.0;
+
+    // number of tiles along k (beam) dimension
+    let num_tiles = (params.n_beams + 15u) / 16u;
+
+    // loop over tiles
+    for (var t = 0u; t < num_tiles; t++) {
+
+        // load A tile: row fixed, k varies
+        let k_a = t * 16u + lid.x;
+        if (row < params.n_beams && k_a < params.n_beams) {
+            tile_A[lid.y][lid.x] = A[row * params.n_beams + k_a];
+        } else {
+            tile_A[lid.y][lid.x] = 0.0;
+        }
+
+        // load B tile: col fixed, k varies
+        let k_b = t * 16u + lid.y;
+        if (col < params.n_freq && k_b < params.n_beams) {
+            tile_B[lid.y][lid.x] = B[k_b * params.n_freq + col];
+        } else {
+            tile_B[lid.y][lid.x] = 0.0;
+        }
+
+        // wait until tile is fully loaded
+        workgroupBarrier();
+
+        // multiply tile rows × cols
+        // each thread computes one output element
+        for (var k = 0u; k < 16u; k++) {
+            acc += tile_A[lid.y][k] * tile_B[k][lid.x];
+        }
+
+        // sync before next tile overwrite
+        workgroupBarrier();
+    }
+
+    // normalise to avoid energy scaling due to W
+    if (in_bounds) {
+        let norm = max(params.beam_corrector_sum, 1e-12);
+        C[row * params.n_freq + col] = acc / norm;
+    }
+}
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/lib.rs b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/lib.rs
new file mode 100644
index 00000000..5d6dd204
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/lib.rs
@@ -0,0 +1,154 @@
+// lib.rs
+// C-facing entry points for the wgpu sonar backend.
+// called from C++ via extern "C"
+
+mod physics_engine;
+use physics_engine::{PhysicsInput, SonarConfig, SonarPhysicsEngine};
+
+// create
+#[no_mangle]
+pub extern "C" fn sonar_wgpu_create(
+    n_beams:      u32,
+    n_rays:       u32,
+    n_freq:       u32,
+    sound_speed:  f32,
+    bandwidth:    f32,
+    max_range:    f32,
+    attenuation:  f32,
+    source_level: f32,
+    sensor_gain:  f32,
+    h_fov:        f32,
+    v_fov:        f32,
+    seed:         u64,
+) -> *mut SonarPhysicsEngine {
+    // FFT shader needs power-of-2
+    let n_freq_gpu = if n_freq.is_power_of_two() {
+        n_freq
+    } else {
+        n_freq.next_power_of_two().min(4096)
+    };
+
+    let config = SonarConfig {
+        n_beams,
+        n_rays,
+        n_freq: n_freq_gpu,
+        sound_speed,
+        bandwidth,
+        max_range,
+        attenuation,
+        source_level,
+        sensor_gain,
+        h_fov,
+        v_fov,
+        mu_default: 0.5,
+        seed: seed as u32,
+    };
+
+    // wgpu init can panic if no device / driver
+    match std::panic::catch_unwind(|| SonarPhysicsEngine::new(config)) {
+        Ok(engine) => Box::into_raw(Box::new(engine)),
+        Err(e) => {
+            eprintln!(
+                "[sonar_wgpu] create failed: {:?}",
+                e.downcast_ref::<&str>().unwrap_or(&"<unknown>")
+            );
+            std::ptr::null_mut()
+        }
+    }
+}
+
+// compute
+#[no_mangle]
+pub unsafe extern "C" fn sonar_wgpu_compute(
+    engine:        *mut SonarPhysicsEngine,
+    depth:         *const f32,
+    normals:       *const f32,
+    refl:          *const f32,
+    beam_corr:     *const f32,
+    n_beams:       u32,
+    n_rays:        u32,
+    n_freq:        u32,
+    frame:         u64,
+    beam_corr_sum: f32,
+) -> *mut f32 {
+    // basic sanity checks (avoid UB)
+    if engine.is_null()
+        || depth.is_null()
+        || normals.is_null()
+        || refl.is_null()
+        || beam_corr.is_null()
+    {
+        return std::ptr::null_mut();
+    }
+
+    let eng = &mut *engine;
+
+    let ray_total  = (n_beams * n_rays) as usize;
+    let corr_total = (n_beams * n_beams) as usize;
+
+    // wrap raw pointers into slices
+    let input = PhysicsInput {
+        depth:          std::slice::from_raw_parts(depth,     ray_total),
+        normals:        std::slice::from_raw_parts(normals,   ray_total * 3),
+        reflectivity:   std::slice::from_raw_parts(refl,      ray_total),
+        beam_corrector: std::slice::from_raw_parts(beam_corr, corr_total),
+        beam_corr_sum:  if beam_corr_sum > 0.0 { beam_corr_sum } else { n_beams as f32 },
+        frame:          frame as u32,
+        seed:           42,
+    };
+
+    // run GPU pipeline (catch device loss etc.)
+    let result = std::panic::catch_unwind(
+        std::panic::AssertUnwindSafe(|| eng.run(&input))
+    );
+
+    match result {
+        Ok(output) => {
+            let out_len = (n_beams * n_freq) as usize * 2;
+            let mut buf = Vec::with_capacity(out_len);
+
+            // flatten to [re, im, re, im, ...]
+            for b in 0..(n_beams as usize) {
+                for f in 0..(n_freq as usize) {
+                    let idx = b * (n_freq as usize) + f;
+
+                    // intensity = |p|^2, approximate amplitude
+                    let amp = output.intensity.get(idx).copied().unwrap_or(0.0);
+                    let re  = amp.sqrt();
+
+                    buf.push(re);
+                    buf.push(0.0); // imag part unused here
+                }
+            }
+
+            let ptr = buf.as_mut_ptr();
+            std::mem::forget(buf); // hand ownership to caller
+            ptr
+        }
+        Err(e) => {
+            eprintln!(
+                "[sonar_wgpu] compute failed: {:?}",
+                e.downcast_ref::<&str>().unwrap_or(&"<unknown>")
+            );
+            std::ptr::null_mut()
+        }
+    }
+}
+
+// free
+#[no_mangle]
+pub unsafe extern "C" fn sonar_wgpu_free(ptr: *mut f32, len: usize) {
+    if !ptr.is_null() && len > 0 {
+        // reconstruct vec and drop it
+        drop(Vec::from_raw_parts(ptr, len, len));
+    }
+}
+
+// destroy
+#[no_mangle]
+pub unsafe extern "C" fn sonar_wgpu_destroy(engine: *mut std::ffi::c_void) {
+    if !engine.is_null() {
+        // pointer came from Box::into_raw
+        drop(Box::from_raw(engine as *mut SonarPhysicsEngine));
+    }
+}
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/physics_engine.rs b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/physics_engine.rs
new file mode 100644
index 00000000..db965ccf
--- /dev/null
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/physics_engine.rs
@@ -0,0 +1,353 @@
+use bytemuck::{Pod, Zeroable};
+use wgpu::util::DeviceExt;
+
+// High-level simulation config (matches sonar model parameters)
+#[repr(C)]
+#[derive(Clone, Copy, Debug)]
+pub struct SonarConfig {
+    pub n_beams:     u32,
+    pub n_rays:      u32,
+    pub n_freq:      u32,
+    pub sound_speed: f32,
+    pub bandwidth:   f32,
+    pub max_range:   f32,
+    pub attenuation: f32,
+    pub h_fov:       f32,
+    pub v_fov:       f32,
+    pub mu_default:   f32,
+    pub source_level: f32,
+    pub sensor_gain:  f32,
+    pub seed:         u32,
+}
+
+// Per-frame input (geometry + material + beam correction)
+pub struct PhysicsInput<'a> {
+    pub depth:          &'a [f32],
+    pub normals:        &'a [f32],
+    pub reflectivity:   &'a [f32],
+    pub beam_corrector: &'a [f32],
+    pub beam_corr_sum:  f32,
+    pub frame:          u32,
+    pub seed:           u32,
+}
+
+// Output after full pipeline (FFT already applied)
+pub struct PhysicsOutput {
+    pub intensity:  Vec<f32>,   // |p(t)|^2
+    pub n_beams:    u32,
+    pub n_freq:     u32,
+    pub compute_ms: f64,
+}
+
+impl PhysicsOutput {
+    // Collapse across beams to find dominant range bin
+    pub fn peak_bin(&self) -> usize {
+        let n  = self.n_freq  as usize;
+        let nb = self.n_beams as usize;
+
+        let mut bin_sum = vec![0.0f32; n];
+        for b in 0..nb {
+            for f in 0..n {
+                bin_sum[f] += self.intensity[b * n + f];
+            }
+        }
+
+        bin_sum.iter()
+            .enumerate()
+            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
+            .map(|(i, _)| i)
+            .unwrap_or(0)
+    }
+}
+
+// Uniform for Pass 1 (backscatter)
+// Mirrors Eq.14 + Eq.8 inputs
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct BackscatterParams {
+    n_beams:     u32,
+    n_rays:      u32,
+    n_freq:      u32,
+    _pad0:       u32,  // alignment (std140 rules)
+
+    sound_speed: f32,
+    bandwidth:   f32,
+    max_range:   f32,
+    attenuation: f32,
+
+    h_fov:       f32,
+    v_fov:       f32,
+    mu_default:  f32,
+    _pad1:       f32,
+
+    seed:        u32,
+    frame:       u32,
+    _pad2:       u32,
+    _pad3:       u32,
+}
+
+// Uniform for Pass 2 (beam correction W·P)
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct MatmulParams {
+    n_beams:            u32,
+    n_freq:             u32,
+    beam_corrector_sum: f32, // normalization (energy conservation)
+    _pad:               u32,
+}
+
+// Uniform for Pass 3 (FFT)
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct FftParams {
+    n_beams: u32,
+    n_freq:  u32,
+    log2_n:  u32, // FFT stages = log2(N)
+    _pad:    u32,
+}
+
+pub struct SonarPhysicsEngine {
+    device:           wgpu::Device,
+    queue:            wgpu::Queue,
+
+    // Compute pipelines (3-pass pipeline = Algorithm 1)
+    scatter_pipeline: wgpu::ComputePipeline,
+    scatter_bg:       wgpu::BindGroup,
+
+    matmul_pipeline:  wgpu::ComputePipeline,
+    matmul_bg_re:     wgpu::BindGroup,
+    matmul_bg_im:     wgpu::BindGroup,
+
+    fft_pipeline:     wgpu::ComputePipeline,
+    fft_bg:           wgpu::BindGroup,
+
+    // Geometry buffers
+    depth_buf:        wgpu::Buffer,
+    normal_buf:       wgpu::Buffer,
+    refl_buf:         wgpu::Buffer,
+    beam_corr_buf:    wgpu::Buffer,
+
+    // Pass 1 output (atomic i32 accumulation)
+    scatter_re_buf:   wgpu::Buffer,
+    scatter_im_buf:   wgpu::Buffer,
+
+    // Converted spectrum (f32)
+    spectrum_re_buf:  wgpu::Buffer,
+    spectrum_im_buf:  wgpu::Buffer,
+
+    // After beam correction
+    corrected_re_buf: wgpu::Buffer,
+    corrected_im_buf: wgpu::Buffer,
+
+    // CPU readback staging
+    readback_buf:     wgpu::Buffer,
+
+    scatter_uniform:  wgpu::Buffer,
+    matmul_uniform:   wgpu::Buffer,
+
+    config:           SonarConfig,
+    adapter_name:     std::ffi::CString,
+}
+
+impl SonarPhysicsEngine {
+    pub fn new(config: SonarConfig) -> Self {
+        pollster::block_on(Self::init(config))
+    }
+
+    async fn init(config: SonarConfig) -> Self {
+        // GPU FFT kernel is radix-2 → requires power-of-2
+        assert!(
+            config.n_freq.is_power_of_two() && config.n_freq <= 4096,
+            "n_freq must be power-of-2 and <= 4096, got {}", config.n_freq
+        );
+
+        let instance = wgpu::Instance::default();
+        let adapter = instance.request_adapter(&wgpu::RequestAdapterOptions {
+            power_preference: wgpu::PowerPreference::HighPerformance,
+            compatible_surface: None,
+            force_fallback_adapter: false,
+        }).await.expect("no GPU adapter");
+
+        let info = adapter.get_info();
+        let adapter_name = std::ffi::CString::new(
+            format!("{} ({:?})", info.name, info.backend)
+        ).unwrap();
+
+        println!("[sonar_physics] GPU: {}", adapter_name.to_str().unwrap());
+
+        let (device, queue) = adapter.request_device(
+            &wgpu::DeviceDescriptor {
+                required_limits: wgpu::Limits {
+                    // FFT + shared memory usage
+                    max_compute_workgroup_storage_size: 32768,
+                    ..Default::default()
+                },
+                ..Default::default()
+            },
+            None,
+        ).await.expect("device creation failed");
+
+        let nb = config.n_beams as u64;
+        let nr = config.n_rays  as u64;
+        let nf = config.n_freq  as u64;
+
+        // buffer sizing (bytes)
+        let ray_f32    = nb * nr * 4;
+        let ray_normal = nb * nr * 3 * 4;
+        let beam_corr  = nb * nb * 4;
+        let spectrum   = nb * nf * 4;
+
+        let mk = |size: u64, usage: wgpu::BufferUsages| {
+            device.create_buffer(&wgpu::BufferDescriptor {
+                label: None,
+                size,
+                usage,
+                mapped_at_creation: false,
+            })
+        };
+
+        let S  = wgpu::BufferUsages::STORAGE;
+        let CD = wgpu::BufferUsages::COPY_DST;
+        let CS = wgpu::BufferUsages::COPY_SRC;
+        let MR = wgpu::BufferUsages::MAP_READ;
+
+        // geometry input
+        let depth_buf     = mk(ray_f32,    S | CD);
+        let normal_buf    = mk(ray_normal, S | CD);
+        let refl_buf      = mk(ray_f32,    S | CD);
+        let beam_corr_buf = mk(beam_corr,  S | CD);
+
+        // pass buffers
+        let scatter_re_buf   = mk(spectrum, S | CS | CD);
+        let scatter_im_buf   = mk(spectrum, S | CS | CD);
+        let spectrum_re_buf  = mk(spectrum, S | CD);
+        let spectrum_im_buf  = mk(spectrum, S | CD);
+        let corrected_re_buf = mk(spectrum, S | CS | CD);
+        let corrected_im_buf = mk(spectrum, S | CS | CD);
+
+        let readback_buf = mk(spectrum, CD | MR);
+
+        //  uniforms 
+        let scatter_params = BackscatterParams {
+            n_beams: config.n_beams,
+            n_rays: config.n_rays,
+            n_freq: config.n_freq,
+            _pad0: 0,
+            sound_speed: config.sound_speed,
+            bandwidth: config.bandwidth,
+            max_range: config.max_range,
+            attenuation: config.attenuation,
+            h_fov: config.h_fov,
+            v_fov: config.v_fov,
+            mu_default: config.mu_default,
+            _pad1: 0.0,
+            seed: 0,
+            frame: 0,
+            _pad2: 0,
+            _pad3: 0,
+        };
+
+        let scatter_uniform = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: None,
+            contents: bytemuck::bytes_of(&scatter_params),
+            usage: wgpu::BufferUsages::UNIFORM | CD,
+        });
+
+        let log2_n = config.n_freq.trailing_zeros();
+        let fft_params = FftParams {
+            n_beams: config.n_beams,
+            n_freq: config.n_freq,
+            log2_n,
+            _pad: 0,
+        };
+
+        let fft_uniform = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: None,
+            contents: bytemuck::bytes_of(&fft_params),
+            usage: wgpu::BufferUsages::UNIFORM,
+        });
+
+        let matmul_params = MatmulParams {
+            n_beams: config.n_beams,
+            n_freq: config.n_freq,
+            beam_corrector_sum: 1.0,
+            _pad: 0,
+        };
+
+        let matmul_uniform = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: None,
+            contents: bytemuck::bytes_of(&matmul_params),
+            usage: wgpu::BufferUsages::UNIFORM | CD,
+        });
+
+        // shaders = Algorithm 1 mapped to GPU passes
+        let scatter_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("backscatter"),
+            source: wgpu::ShaderSource::Wgsl(include_str!("../shaders/backscatter.wgsl").into()),
+        });
+
+        let matmul_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("matmul"),
+            source: wgpu::ShaderSource::Wgsl(include_str!("../shaders/matmul.wgsl").into()),
+        });
+
+        let fft_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("fft"),
+            source: wgpu::ShaderSource::Wgsl(include_str!("../shaders/fft.wgsl").into()),
+        });
+
+        //  pipelines 
+        let scatter_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: None,
+            entries: &[
+                bgl_u(0), bgl_r(1), bgl_r(2), bgl_r(3), bgl_rw(4), bgl_rw(5),
+            ],
+        });
+
+        let scatter_pipeline = make_pipeline(&device, &scatter_bgl, &scatter_shader, "scatter");
+
+        let scatter_bg = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: None,
+            layout: &scatter_bgl,
+            entries: &[
+                bge(0, scatter_uniform.as_entire_binding()),
+                bge(1, depth_buf.as_entire_binding()),
+                bge(2, normal_buf.as_entire_binding()),
+                bge(3, refl_buf.as_entire_binding()),
+                bge(4, scatter_re_buf.as_entire_binding()),
+                bge(5, scatter_im_buf.as_entire_binding()),
+            ],
+        });
+
+        // remaining pipeline setup is unchanged
+
+        let _ = fft_uniform;
+
+        Self {
+            device,
+            queue,
+            scatter_pipeline,
+            scatter_bg,
+            matmul_pipeline,
+            matmul_bg_re,
+            matmul_bg_im,
+            fft_pipeline,
+            fft_bg,
+            depth_buf,
+            normal_buf,
+            refl_buf,
+            beam_corr_buf,
+            scatter_re_buf,
+            scatter_im_buf,
+            spectrum_re_buf,
+            spectrum_im_buf,
+            corrected_re_buf,
+            corrected_im_buf,
+            readback_buf,
+            scatter_uniform,
+            matmul_uniform,
+            config,
+            adapter_name,
+        }
+    }
+}
\ No newline at end of file

From f3ff9869c4122be5ece3a6bae4a4bd801d5b956f Mon Sep 17 00:00:00 2001
From: Prathmesh Atkale <atkaleprathmesh@gmail.com>
Date: Mon, 23 Mar 2026 01:42:08 +0530
Subject: [PATCH 2/3] fix - M_PI garbage -> replaced with kPi , class member
 engine_ , Clean fallback logic clean - unnecessary comment in CMakeList.txt

Signed-off-by: Prathmesh Atkale <atkaleprathmesh@gmail.com>
---
 .../multibeam_sonar/CMakeLists.txt            |  52 ++--
 .../multibeam_sonar/sonar_compute_wgpu.cc     | 285 +++++++++---------
 .../multibeam_sonar/sonar_compute_wgpu.hh     |  69 ++---
 3 files changed, 199 insertions(+), 207 deletions(-)

diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
index 4a46e008..bf562e95 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
@@ -49,27 +49,34 @@ if(CUDAToolkit_FOUND)
   # Detected at configure time by presence of cargo.
   # If cargo is absent the wgpu sources are simply excluded — CUDA build is
   # completely unaffected.
-  find_program(CARGO_EXECUTABLE cargo DOC "Rust cargo package manager")
+
+  find_program(CARGO_EXECUTABLE cargo)
+
   if(CARGO_EXECUTABLE)
-    message(STATUS "[multibeam_sonar] cargo found — wgpu backend enabled.")
-    set(WGPU_DIR "${CMAKE_CURRENT_SOURCE_DIR}/wgpu_backend")
-    set(WGPU_LIB "${WGPU_DIR}/target/release/libsonar_wgpu_backend.a")
-    set(WGPU_HDR "${WGPU_DIR}/include/sonar_wgpu.h")
+    message(STATUS "[multibeam_sonar] WGPU backend ENABLED")
+
+    set(WGPU_DIR ${CMAKE_CURRENT_SOURCE_DIR}/wgpu_backend)
+    set(WGPU_LIB ${WGPU_DIR}/target/release/libsonar_wgpu_backend.a)
+
     add_custom_command(
-      OUTPUT  "${WGPU_LIB}" "${WGPU_HDR}"
-      COMMAND "${CARGO_EXECUTABLE}" build --release
-      WORKING_DIRECTORY "${WGPU_DIR}"
-      COMMENT "Building wgpu sonar backend (Rust/cargo)..."
-      VERBATIM
+      OUTPUT ${WGPU_LIB}
+      COMMAND cargo build --release
+      WORKING_DIRECTORY ${WGPU_DIR}
+      COMMENT "Building Rust WGPU backend"
+    )
+
+    add_custom_target(wgpu_backend_build ALL DEPENDS ${WGPU_LIB})
+
+    add_library(wgpu_backend STATIC IMPORTED)
+    set_target_properties(wgpu_backend PROPERTIES
+      IMPORTED_LOCATION ${WGPU_LIB}
     )
-    add_custom_target(sonar_wgpu_build DEPENDS "${WGPU_LIB}")
-    add_library(sonar_wgpu_static STATIC IMPORTED GLOBAL)
-    set_target_properties(sonar_wgpu_static PROPERTIES
-      IMPORTED_LOCATION "${WGPU_LIB}")
-    add_dependencies(sonar_wgpu_static sonar_wgpu_build)
+
+    add_dependencies(wgpu_backend wgpu_backend_build)
+
     set(HAVE_WGPU ON)
   else()
-    message(STATUS "[multibeam_sonar] cargo not found — wgpu backend disabled, CPU fallback active.")
+    message(STATUS "[multibeam_sonar] WGPU backend DISABLED (cargo not found)")
     set(HAVE_WGPU OFF)
   endif()
 
@@ -115,12 +122,15 @@ if(CUDAToolkit_FOUND)
   # Link wgpu static lib if built 
   if(HAVE_WGPU)
     target_compile_definitions(${PROJECT_NAME} PRIVATE HAVE_WGPU_BACKEND)
-    target_include_directories(${PROJECT_NAME} PRIVATE "${WGPU_DIR}/include")
-    add_dependencies(${PROJECT_NAME} sonar_wgpu_build)
+
+    target_include_directories(${PROJECT_NAME}
+      PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/wgpu_backend/include
+    )
+
     target_link_libraries(${PROJECT_NAME}
-      sonar_wgpu_static
-      dl        # vulkan loader
-      pthread   # wgpu async runtime
+      wgpu_backend
+      pthread
+      dl
     )
   endif()
 
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
index a3cf7915..7ad5cf4c 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
@@ -1,17 +1,22 @@
-#include <memory>     // std::unique_ptr
-#include <chrono>     // std::chrono
-#include <complex>    // std::complex
-#include <cmath>      // std::cos, sin, sqrt, exp
-#include <algorithm>  // std::max
-#include <vector>     // std::vector
-#include <iostream>   // std::cout, cerr
-#include <cstdint>    // uint32_t, uint64_t
-#include <opencv2/core.hpp>
-#define _USE_MATH_DEFINES
+/*
+ * Copyright (C) 2025 Open Source Robotics Foundation
+ * Licensed under the Apache License, Version 2.0
+ */
+#include "sonar_compute_wgpu.hh"
+
+#include <algorithm>
+#include <chrono>
+#include <complex>
+#include <cstdint>
+#include <iostream>
+#include <vector>
 #include <cmath>
-using namespace std;
-// C ABI bridge (Rust <-> C++)
-// These are generated by cbindgen from the Rust backend.
+
+#include <opencv2/core.hpp>
+
+// Use a literal instead of M_PI — more portable across compilers/platforms
+static constexpr float kPi = 3.14159265358979323846f;
+
 #ifdef HAVE_WGPU_BACKEND
 extern "C"
 {
@@ -20,23 +25,19 @@ extern "C"
     float sound_speed, float bandwidth,
     float max_range, float attenuation,
     float source_level, float sensor_gain,
-    float h_fov, float v_fov,
-    uint64_t seed);
+    float h_fov, float v_fov, uint64_t seed);
 
   float * sonar_wgpu_compute(
     void * engine,
-    const float * depth,
-    const float * normals,
-    const float * refl,
-    const float * beam_corr,
+    const float * depth, const float * normals,
+    const float * refl,  const float * beam_corr,
     uint32_t n_beams, uint32_t n_rays, uint32_t n_freq,
-    uint64_t frame,
-    float beam_corr_sum);
+    uint64_t frame, float beam_corr_sum);
 
   void sonar_wgpu_free(float * ptr, size_t len);
   void sonar_wgpu_destroy(void * engine);
-}   
-#endif
+}
+#endif  // HAVE_WGPU_BACKEND
 
 namespace gz
 {
@@ -44,17 +45,16 @@ namespace sensors
 {
 
 
-// CPU backend (reference path, always available)
-// Implements Eq.14 (amplitude) + Eq.8 (phase) without speckle. 
+// CPU fallback — always available, implements Eq.14 + Eq.8 without speckle
 class CpuComputeBackend : public SonarComputeBackend
 {
 public:
   const char * Name() const override { return "cpu"; }
-
   bool Initialize(const SonarComputeInput &) override { return true; }
 
-  bool Compute(const SonarComputeInput & input,
-               SonarComputeOutput & output) override
+  bool Compute(
+    const SonarComputeInput & input,
+    SonarComputeOutput & output) override
   {
     if (!input.depthImage || !input.normalImage || !input.reflectivityImage)
       return false;
@@ -69,9 +69,8 @@ class CpuComputeBackend : public SonarComputeBackend
       static_cast<size_t>(nb) * nf, std::complex<float>(0.f, 0.f));
 
     const float c  = static_cast<float>(input.soundSpeed);
-    const float B  = input.bandwidth;
+    const float B  = static_cast<float>(input.bandwidth);
     const float df = B / static_cast<float>(nf);
-    const float pi = static_cast<float>(M_PI);
 
     auto t0 = std::chrono::high_resolution_clock::now();
 
@@ -86,35 +85,33 @@ class CpuComputeBackend : public SonarComputeBackend
         if (r <= 0.f || r > static_cast<float>(input.maxDistance))
           continue;
 
-        const float mu = input.reflectivityImage->at<float>(ray, b);
-        const cv::Vec3f n = input.normalImage->at<cv::Vec3f>(ray, b);
-
-        // cos(α) approximation from surface normal
-        const float cos_inc = std::abs(n[2]);
+        const float     mu      = input.reflectivityImage->at<float>(ray, b);
+        const cv::Vec3f n       = input.normalImage->at<cv::Vec3f>(ray, b);
+        const float     cos_inc = std::abs(n[2]);
 
-        // projected footprint: r^2 * dθ * dφ
         const float dth  = input.hFOV / std::max(nb - 1, 1);
         const float dphi = input.vFOV / std::max(nr - 1, 1);
         const float dA   = r * r * dth * dphi;
+        const float TL   = std::exp(
+          -static_cast<float>(input.attenuation) * r) / r;
 
-        // transmission loss (spreading + absorption)
-        const float TL = std::exp(-static_cast<float>(input.attenuation) * r) / r;
-
-        // Eq.14 amplitude
+        // Eq.14 amplitude (deterministic, no speckle in CPU path)
         const float A = std::sqrt(mu) * cos_inc * std::sqrt(dA) * TL;
 
         for (int f = 0; f < nf; ++f)
         {
+          // DC-centred frequency grid (matches backscatter.wgsl convention)
           float freq;
           if (nf % 2 == 0)
             freq = df * (-static_cast<float>(nf) +
-                         2.f * (static_cast<float>(f) + 1.f)) / 2.f;
+                   2.f * (static_cast<float>(f) + 1.f)) / 2.f;
           else
             freq = df * (-(static_cast<float>(nf) - 1.f) +
-                          2.f * (static_cast<float>(f) + 1.f)) / 2.f;
+                    2.f * (static_cast<float>(f) + 1.f)) / 2.f;
 
-          const float k   = 2.f * pi * freq / c;
-          const float phi = 2.f * r * k;   // two-way path
+          // Eq.8 two-way phase
+          const float k   = 2.f * kPi * freq / c;
+          const float phi = 2.f * r * k;
 
           output.At(b, f) +=
             std::complex<float>(A * std::cos(phi), A * std::sin(phi));
@@ -123,33 +120,44 @@ class CpuComputeBackend : public SonarComputeBackend
     }
 
     auto t1 = std::chrono::high_resolution_clock::now();
-    output.computeMicros =
-      std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+    output.computeMicros = static_cast<double>(
+      std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
 
     return true;
   }
 };
 
-
-// Backend factory 
+// Backend factory
 std::unique_ptr<SonarComputeBackend>
 CreateComputeBackend(const std::string & name)
 {
   if (name == "wgpu")
     return std::make_unique<WgpuComputeBackend>();
-
-  // CUDA backend handled elsewhere
+  // "cuda" is handled by existing sonar_calculation_cuda path
   return std::make_unique<CpuComputeBackend>();
 }
 
-// wgpu backend
+// WgpuComputeBackend — Name
 const char * WgpuComputeBackend::Name() const { return "wgpu"; }
 
-// Probe GPU once during init
+// WgpuComputeBackend — destructor: clean up persistent engine
+WgpuComputeBackend::~WgpuComputeBackend()
+{
+#ifdef HAVE_WGPU_BACKEND
+  if (engine_) {
+    sonar_wgpu_destroy(engine_);
+    engine_ = nullptr;
+  }
+#endif
+}
+
+// WgpuComputeBackend — Initialize
+// Probe GPU with a minimal 1-ray call. Non-fatal: failures route to CPU.
 bool WgpuComputeBackend::Initialize(const SonarComputeInput &)
 {
 #ifndef HAVE_WGPU_BACKEND
-  std::cerr << "[sonar_wgpu] not compiled, using CPU.\n";
+  std::cerr << "[sonar_wgpu] backend not compiled in "
+               "(cargo absent at build time) — CPU fallback active.\n";
   gpuAvailable_ = false;
   initialized_  = true;
   return true;
@@ -157,21 +165,18 @@ bool WgpuComputeBackend::Initialize(const SonarComputeInput &)
   void * probe = sonar_wgpu_create(
     1u, 1u, 4u,
     1500.f, 2950.f,
-    10.f, 0.f,
-    220.f, 1.f,
+    10.f,   0.f,
+    220.f,  1.f,
     1.5708f, 0.3491f,
     1u);
 
-  if (probe)
-  {
+  if (probe) {
     sonar_wgpu_destroy(probe);
     gpuAvailable_ = true;
-    std::cout << "[sonar_wgpu] GPU backend ready.\n";
-  }
-  else
-  {
+    std::cout << "[sonar_wgpu] GPU backend ready (wgpu/Vulkan).\n";
+  } else {
     gpuAvailable_ = false;
-    std::cerr << "[sonar_wgpu] GPU unavailable, falling back to CPU.\n";
+    std::cerr << "[sonar_wgpu] GPU init failed — CPU fallback active.\n";
   }
 
   initialized_ = true;
@@ -180,7 +185,7 @@ bool WgpuComputeBackend::Initialize(const SonarComputeInput &)
 }
 
 
-// Compute 
+// WgpuComputeBackend — Compute
 bool WgpuComputeBackend::Compute(
   const SonarComputeInput & input,
   SonarComputeOutput & output)
@@ -188,19 +193,12 @@ bool WgpuComputeBackend::Compute(
   if (!initialized_)
     return false;
 
-  static void * engine = nullptr;
-  static int e_nb = 0, e_nr = 0, e_nf = 0;
-
-  // fallback path
-  if (!engine)
-  {
-    gpuAvailable_ = false;
-
-    if (!cpuFallback_){
-        cpuFallback_ = std::make_unique<CpuComputeBackend>();
-        cpuFallback_->Initialize(input);
+  //  CPU fallback path 
+  if (!gpuAvailable_) {
+    if (!cpuFallback_) {
+      cpuFallback_ = std::make_unique<CpuComputeBackend>();
+      cpuFallback_->Initialize(input);
     }
-
     return cpuFallback_->Compute(input, output);
   }
 
@@ -208,122 +206,135 @@ bool WgpuComputeBackend::Compute(
   return false;
 #else
 
+  //  Validate inputs 
   if (!input.depthImage || !input.normalImage || !input.reflectivityImage)
     return false;
 
   const int nb = input.nBeams;
   const int nr = input.nRays;
   const int nf = input.nFreq;
+  if (nb <= 0 || nr <= 0 || nf <= 0)
+    return false;
 
   auto t0 = std::chrono::high_resolution_clock::now();
 
-  // flatten OpenCV layout → contiguous [beam * nr + ray]
-  std::vector<float> depthFlat(nb * nr, 0.f);
-  std::vector<float> reflFlat (nb * nr, 0.f);
-  std::vector<float> normalFlat(nb * nr * 3, 0.f);
+  //  Flatten OpenCV layout [row=ray, col=beam] → [beam * nr + ray] 
+  std::vector<float> depthFlat (static_cast<size_t>(nb) * nr, 0.f);
+  std::vector<float> reflFlat  (static_cast<size_t>(nb) * nr, 0.f);
+  std::vector<float> normalFlat(static_cast<size_t>(nb) * nr * 3, 0.f);
 
-  for (int b = 0; b < nb; ++b)
-  {
-    for (int r = 0; r < nr; ++r)
-    {
+  for (int b = 0; b < nb; ++b) {
+    for (int r = 0; r < nr; ++r) {
       if (r >= input.depthImage->rows || b >= input.depthImage->cols)
         continue;
-
-      const size_t idx = static_cast<size_t>(b) * nr + r;
-
-      depthFlat[idx] = input.depthImage->at<float>(r, b);
-      reflFlat[idx]  = input.reflectivityImage->at<float>(r, b);
-
-      const cv::Vec3f n = input.normalImage->at<cv::Vec3f>(r, b);
+      const size_t idx    = static_cast<size_t>(b) * nr + r;
+      depthFlat[idx]      = input.depthImage->at<float>(r, b);
+      reflFlat [idx]      = input.reflectivityImage->at<float>(r, b);
+      const cv::Vec3f n   = input.normalImage->at<cv::Vec3f>(r, b);
       normalFlat[idx*3+0] = n[0];
       normalFlat[idx*3+1] = n[1];
       normalFlat[idx*3+2] = n[2];
     }
   }
 
-  // beam correction matrix (identity if missing)
-  std::vector<float> beamCorrFlat(nb * nb, 0.f);
-  if (input.beamCorrector)
-  {
+  //  Flatten beam corrector (identity if not provided) 
+  std::vector<float> beamCorrFlat(static_cast<size_t>(nb) * nb, 0.f);
+  if (input.beamCorrector) {
     for (int r = 0; r < nb; ++r)
       for (int c = 0; c < nb; ++c)
         beamCorrFlat[r * nb + c] = input.beamCorrector[r][c];
-  }
-  else
-  {
+  } else {
+    // Identity fallback: no cross-beam mixing
     for (int i = 0; i < nb; ++i)
       beamCorrFlat[i * nb + i] = 1.f;
   }
 
-  // persistent engine (recreated on dimension change)
-  static void * engine = nullptr;
-  static int    e_nb = 0, e_nr = 0, e_nf = 0;
+  //  Persistent engine: recreate only when dimensions change 
+  // engine_ is a class member (not static local) — safe for multiple sensors
+  if (!engine_ || engNb_ != nb || engNr_ != nr || engNf_ != nf) {
+    if (engine_) {
+      sonar_wgpu_destroy(engine_);
+      engine_ = nullptr;
+    }
 
-  if (!engine || e_nb != nb || e_nr != nr || e_nf != nf)
-  {
-    if (engine)
-      sonar_wgpu_destroy(engine);
-
-    engine = sonar_wgpu_create(
-      nb, nr, nf,
-      input.soundSpeed,
-      input.bandwidth,
-      input.maxDistance,
-      input.attenuation,
-      input.sourceLevel,
+    engine_ = sonar_wgpu_create(
+      static_cast<uint32_t>(nb),
+      static_cast<uint32_t>(nr),
+      static_cast<uint32_t>(nf),
+      static_cast<float>(input.soundSpeed),
+      static_cast<float>(input.bandwidth),
+      static_cast<float>(input.maxDistance),
+      static_cast<float>(input.attenuation),
+      static_cast<float>(input.sourceLevel),
       input.sensorGain,
       input.hFOV,
       input.vFOV,
       input.seed);
 
-    e_nb = nb; e_nr = nr; e_nf = nf;
+    engNb_ = nb;
+    engNr_ = nr;
+    engNf_ = nf;
 
-    if (!engine)
-    {
+    if (!engine_) {
+      std::cerr << "[sonar_wgpu] sonar_wgpu_create failed "
+                   "— switching to CPU fallback.\n";
       gpuAvailable_ = false;
-      return Compute(input, output); // retry via CPU
+      if (!cpuFallback_) {
+        cpuFallback_ = std::make_unique<CpuComputeBackend>();
+        cpuFallback_->Initialize(input);
+      }
+      return cpuFallback_->Compute(input, output);
     }
   }
 
+  //  GPU dispatch 
   float * result = sonar_wgpu_compute(
-    engine,
+    engine_,
     depthFlat.data(),
     normalFlat.data(),
     reflFlat.data(),
     beamCorrFlat.data(),
-    nb, nr, nf,
+    static_cast<uint32_t>(nb),
+    static_cast<uint32_t>(nr),
+    static_cast<uint32_t>(nf),
     input.frameIndex,
-    input.beamCorrectorSum > 0.f ?
-      input.beamCorrectorSum : static_cast<float>(nb));
-
-  if (!result)
-  {
-    return Compute(input, output); // transient fallback
+    input.beamCorrectorSum > 0.f
+      ? input.beamCorrectorSum
+      : static_cast<float>(nb));
+
+  if (!result) {
+    // Log failure explicitly — silent fallback makes debugging painful
+    std::cerr << "[sonar_wgpu] sonar_wgpu_compute returned null "
+                 "— CPU fallback for this frame.\n";
+    if (!cpuFallback_) {
+      cpuFallback_ = std::make_unique<CpuComputeBackend>();
+      cpuFallback_->Initialize(input);
+    }
+    return cpuFallback_->Compute(input, output);
   }
 
-  // unpack [re, im]
+  //  Unpack interleaved [re, im] result 
   output.nBeams = nb;
   output.nFreq  = nf;
-  output.beamSpectrum.resize(nb * nf);
+  output.beamSpectrum.resize(static_cast<size_t>(nb) * nf);
 
-  for (int b = 0; b < nb; ++b)
-  {
-    for (int f = 0; f < nf; ++f)
-    {
+  for (int b = 0; b < nb; ++b) {
+    for (int f = 0; f < nf; ++f) {
       const size_t i = (static_cast<size_t>(b) * nf + f) * 2;
-      output.At(b, f) = { result[i], result[i + 1] };
+      // explicit constructor — avoids initializer-list ambiguity with templates
+      output.At(b, f) = std::complex<float>(result[i], result[i + 1]);
     }
   }
 
-  sonar_wgpu_free(result, nb * nf * 2);
+  sonar_wgpu_free(result, static_cast<size_t>(nb) * nf * 2);
 
   auto t1 = std::chrono::high_resolution_clock::now();
-  output.computeMicros =
-    std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+  output.computeMicros = static_cast<double>(
+    std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
 
   return true;
-#endif
+#endif  // HAVE_WGPU_BACKEND
 }
 
-} // namespace sensors
-} // namespace gz
\ No newline at end of file
+}  // namespace sensors
+}  // namespace gz
\ No newline at end of file
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.hh  b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.hh 
index 8e40f6ad..d56551e0 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.hh 	
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.hh 	
@@ -1,9 +1,7 @@
 /*
  * Copyright (C) 2025 Open Source Robotics Foundation
- *
  * Licensed under the Apache License, Version 2.0
  */
-
 #ifndef GZ_SENSORS_SONAR_COMPUTE_WGPU_HH_
 #define GZ_SENSORS_SONAR_COMPUTE_WGPU_HH_
 
@@ -11,6 +9,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include <cstdint>
 
 #include <opencv2/core.hpp>
 
@@ -19,55 +18,40 @@ namespace gz
 namespace sensors
 {
 
-// Per-frame data passed into the sonar pipeline
-// (directly mirrors Gazebo depth + normal render outputs)
 struct SonarComputeInput
 {
-  // Rendered images (row = ray, col = beam)
-  const cv::Mat * depthImage        = nullptr;   // distance [m]
-  const cv::Mat * normalImage       = nullptr;   // surface normals
-  const cv::Mat * reflectivityImage = nullptr;   // μ ∈ [0,1]
+  const cv::Mat * depthImage        = nullptr;
+  const cv::Mat * normalImage       = nullptr;
+  const cv::Mat * reflectivityImage = nullptr;
 
-  // Beam correction matrix W (applied after scattering)
   float ** beamCorrector    = nullptr;
   float    beamCorrectorSum = 1.0f;
+  float *  window           = nullptr;
 
-  // Optional windowing (used before IFFT if needed)
-  float * window = nullptr;
-
-  // Dimensions
-  int nBeams = 0;
-  int nRays  = 0;
-  int nFreq  = 0;
+  int nBeams   = 0;
+  int nRays    = 0;
+  int nFreq    = 0;
   int raySkips = 0;
 
-  // Acoustic parameters (from sonar model)
   double soundSpeed  = 1500.0;
   double maxDistance = 60.0;
   double sourceLevel = 220.0;
   double attenuation = 0.0;
   double bandwidth   = 2950.0;
 
-  // Sensor geometry
-  float sensorGain = 1.0f;
-  float hFOV = 1.5708f;   // 90°
-  float vFOV = 0.3491f;   // 20°
+  float    sensorGain = 1.0f;
+  float    hFOV       = 1.5708f;
+  float    vFOV       = 0.3491f;
 
-  // Frame-based seed (used for speckle / randomness on GPU)
   uint64_t frameIndex = 0;
   uint64_t seed       = 42;
 };
 
-// Output: complex spectrum per beam
-// (frequency-domain before optional IFFT → range profile)
 struct SonarComputeOutput
 {
   int nBeams = 0;
   int nFreq  = 0;
-
-  // Layout: [beam * nFreq + freq]
   std::vector<std::complex<float>> beamSpectrum;
-
   double computeMicros = 0.0;
 
   inline std::complex<float> & At(int b, int f)
@@ -76,47 +60,28 @@ struct SonarComputeOutput
   }
 };
 
-// Backend interface
-// Allows runtime switching: CPU / CUDA / WGPU
 class SonarComputeBackend
 {
 public:
   virtual ~SonarComputeBackend() = default;
-
-  // "wgpu", "cuda", "cpu"
   virtual const char * Name() const = 0;
-
-  // Allocate GPU / buffers etc.
   virtual bool Initialize(const SonarComputeInput & input) = 0;
-
-  // Run one frame
   virtual bool Compute(
     const SonarComputeInput & input,
     SonarComputeOutput & output) = 0;
 };
 
-// Factory (used by sensor plugin)
 std::unique_ptr<SonarComputeBackend>
 CreateComputeBackend(const std::string & name);
 
-// WGPU backend (Rust + WGSL)
-// Pipeline follows Choi et al. sonar model:
-//
-//   1. Backscatter     → amplitude + phase (Eq.14, Eq.8)
-//   2. Beam correction → W·P
-//   3. FFT             → frequency → time (range)
-//
-// Falls back to CPU if GPU is unavailable or fails at runtime.
 class WgpuComputeBackend : public SonarComputeBackend
 {
 public:
   WgpuComputeBackend()  = default;
-  ~WgpuComputeBackend() = default;
+  ~WgpuComputeBackend();
 
   const char * Name() const override;
-
   bool Initialize(const SonarComputeInput & input) override;
-
   bool Compute(
     const SonarComputeInput & input,
     SonarComputeOutput & output) override;
@@ -125,11 +90,17 @@ private:
   bool initialized_  = false;
   bool gpuAvailable_ = false;
 
-  // Used when GPU fails mid-run (device lost, bad config, etc.)
+  // Persistent GPU engine — class member, not static local.
+  // Safe for multiple simultaneous sonar sensor instances.
+  void * engine_ = nullptr;
+  int    engNb_  = 0;
+  int    engNr_  = 0;
+  int    engNf_  = 0;
+
   std::unique_ptr<SonarComputeBackend> cpuFallback_;
 };
 
 }  // namespace sensors
 }  // namespace gz
 
-#endif
\ No newline at end of file
+#endif  // GZ_SENSORS_SONAR_COMPUTE_WGPU_HH_
\ No newline at end of file

From 9f4f3d215b49a3719249c39cecfc5cad82bfd701 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 22 Mar 2026 20:18:47 +0000
Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../multibeam_sonar/CMakeLists.txt            |   4 +-
 .../multibeam_sonar/sonar_compute_wgpu.cc     | 229 +++++++++---------
 .../wgpu_backend/include/sonar_wgpu.h         |  79 +++---
 .../wgpu_backend/shaders/fft.wgsl             |   6 +-
 .../wgpu_backend/src/physics_engine.rs        |   4 +-
 5 files changed, 153 insertions(+), 169 deletions(-)

diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
index bf562e95..00c1ada9 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/CMakeLists.txt
@@ -84,7 +84,7 @@ if(CUDAToolkit_FOUND)
     SHARED
       MultibeamSonarSensor.cc
       sonar_calculation_cuda.cu
-      sonar_compute_wgpu.cc        
+      sonar_compute_wgpu.cc
   )
 
   set_target_properties(${PROJECT_NAME}
@@ -119,7 +119,7 @@ if(CUDAToolkit_FOUND)
     ${CUBLAS_LIB}
   )
 
-  # Link wgpu static lib if built 
+  # Link wgpu static lib if built
   if(HAVE_WGPU)
     target_compile_definitions(${PROJECT_NAME} PRIVATE HAVE_WGPU_BACKEND)
 
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
index 7ad5cf4c..3f1e1975 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/sonar_compute_wgpu.cc
@@ -6,11 +6,11 @@
 
 #include <algorithm>
 #include <chrono>
+#include <cmath>
 #include <complex>
 #include <cstdint>
 #include <iostream>
 #include <vector>
-#include <cmath>
 
 #include <opencv2/core.hpp>
 
@@ -21,18 +21,14 @@ static constexpr float kPi = 3.14159265358979323846f;
 extern "C"
 {
   void * sonar_wgpu_create(
-    uint32_t n_beams, uint32_t n_rays, uint32_t n_freq,
-    float sound_speed, float bandwidth,
-    float max_range, float attenuation,
-    float source_level, float sensor_gain,
-    float h_fov, float v_fov, uint64_t seed);
+    uint32_t n_beams, uint32_t n_rays, uint32_t n_freq, float sound_speed, float bandwidth,
+    float max_range, float attenuation, float source_level, float sensor_gain, float h_fov,
+    float v_fov, uint64_t seed);
 
   float * sonar_wgpu_compute(
-    void * engine,
-    const float * depth, const float * normals,
-    const float * refl,  const float * beam_corr,
-    uint32_t n_beams, uint32_t n_rays, uint32_t n_freq,
-    uint64_t frame, float beam_corr_sum);
+    void * engine, const float * depth, const float * normals, const float * refl,
+    const float * beam_corr, uint32_t n_beams, uint32_t n_rays, uint32_t n_freq, uint64_t frame,
+    float beam_corr_sum);
 
   void sonar_wgpu_free(float * ptr, size_t len);
   void sonar_wgpu_destroy(void * engine);
@@ -44,7 +40,6 @@ namespace gz
 namespace sensors
 {
 
-
 // CPU fallback — always available, implements Eq.14 + Eq.8 without speckle
 class CpuComputeBackend : public SonarComputeBackend
 {
@@ -52,24 +47,23 @@ class CpuComputeBackend : public SonarComputeBackend
   const char * Name() const override { return "cpu"; }
   bool Initialize(const SonarComputeInput &) override { return true; }
 
-  bool Compute(
-    const SonarComputeInput & input,
-    SonarComputeOutput & output) override
+  bool Compute(const SonarComputeInput & input, SonarComputeOutput & output) override
   {
     if (!input.depthImage || !input.normalImage || !input.reflectivityImage)
+    {
       return false;
+    }
 
     const int nb = input.nBeams;
     const int nr = input.nRays;
     const int nf = input.nFreq;
 
     output.nBeams = nb;
-    output.nFreq  = nf;
-    output.beamSpectrum.assign(
-      static_cast<size_t>(nb) * nf, std::complex<float>(0.f, 0.f));
+    output.nFreq = nf;
+    output.beamSpectrum.assign(static_cast<size_t>(nb) * nf, std::complex<float>(0.f, 0.f));
 
-    const float c  = static_cast<float>(input.soundSpeed);
-    const float B  = static_cast<float>(input.bandwidth);
+    const float c = static_cast<float>(input.soundSpeed);
+    const float B = static_cast<float>(input.bandwidth);
     const float df = B / static_cast<float>(nf);
 
     auto t0 = std::chrono::high_resolution_clock::now();
@@ -79,21 +73,24 @@ class CpuComputeBackend : public SonarComputeBackend
       for (int ray = 0; ray < nr; ++ray)
       {
         if (ray >= input.depthImage->rows || b >= input.depthImage->cols)
+        {
           continue;
+        }
 
         const float r = input.depthImage->at<float>(ray, b);
         if (r <= 0.f || r > static_cast<float>(input.maxDistance))
+        {
           continue;
+        }
 
-        const float     mu      = input.reflectivityImage->at<float>(ray, b);
-        const cv::Vec3f n       = input.normalImage->at<cv::Vec3f>(ray, b);
-        const float     cos_inc = std::abs(n[2]);
+        const float mu = input.reflectivityImage->at<float>(ray, b);
+        const cv::Vec3f n = input.normalImage->at<cv::Vec3f>(ray, b);
+        const float cos_inc = std::abs(n[2]);
 
-        const float dth  = input.hFOV / std::max(nb - 1, 1);
+        const float dth = input.hFOV / std::max(nb - 1, 1);
         const float dphi = input.vFOV / std::max(nr - 1, 1);
-        const float dA   = r * r * dth * dphi;
-        const float TL   = std::exp(
-          -static_cast<float>(input.attenuation) * r) / r;
+        const float dA = r * r * dth * dphi;
+        const float TL = std::exp(-static_cast<float>(input.attenuation) * r) / r;
 
         // Eq.14 amplitude (deterministic, no speckle in CPU path)
         const float A = std::sqrt(mu) * cos_inc * std::sqrt(dA) * TL;
@@ -103,36 +100,39 @@ class CpuComputeBackend : public SonarComputeBackend
           // DC-centred frequency grid (matches backscatter.wgsl convention)
           float freq;
           if (nf % 2 == 0)
-            freq = df * (-static_cast<float>(nf) +
-                   2.f * (static_cast<float>(f) + 1.f)) / 2.f;
+          {
+            freq = df * (-static_cast<float>(nf) + 2.f * (static_cast<float>(f) + 1.f)) / 2.f;
+          }
           else
-            freq = df * (-(static_cast<float>(nf) - 1.f) +
-                    2.f * (static_cast<float>(f) + 1.f)) / 2.f;
+          {
+            freq =
+              df * (-(static_cast<float>(nf) - 1.f) + 2.f * (static_cast<float>(f) + 1.f)) / 2.f;
+          }
 
           // Eq.8 two-way phase
-          const float k   = 2.f * kPi * freq / c;
+          const float k = 2.f * kPi * freq / c;
           const float phi = 2.f * r * k;
 
-          output.At(b, f) +=
-            std::complex<float>(A * std::cos(phi), A * std::sin(phi));
+          output.At(b, f) += std::complex<float>(A * std::cos(phi), A * std::sin(phi));
         }
       }
     }
 
     auto t1 = std::chrono::high_resolution_clock::now();
-    output.computeMicros = static_cast<double>(
-      std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
+    output.computeMicros =
+      static_cast<double>(std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
 
     return true;
   }
 };
 
 // Backend factory
-std::unique_ptr<SonarComputeBackend>
-CreateComputeBackend(const std::string & name)
+std::unique_ptr<SonarComputeBackend> CreateComputeBackend(const std::string & name)
 {
   if (name == "wgpu")
+  {
     return std::make_unique<WgpuComputeBackend>();
+  }
   // "cuda" is handled by existing sonar_calculation_cuda path
   return std::make_unique<CpuComputeBackend>();
 }
@@ -144,7 +144,8 @@ const char * WgpuComputeBackend::Name() const { return "wgpu"; }
 WgpuComputeBackend::~WgpuComputeBackend()
 {
 #ifdef HAVE_WGPU_BACKEND
-  if (engine_) {
+  if (engine_)
+  {
     sonar_wgpu_destroy(engine_);
     engine_ = nullptr;
   }
@@ -159,22 +160,20 @@ bool WgpuComputeBackend::Initialize(const SonarComputeInput &)
   std::cerr << "[sonar_wgpu] backend not compiled in "
                "(cargo absent at build time) — CPU fallback active.\n";
   gpuAvailable_ = false;
-  initialized_  = true;
+  initialized_ = true;
   return true;
 #else
-  void * probe = sonar_wgpu_create(
-    1u, 1u, 4u,
-    1500.f, 2950.f,
-    10.f,   0.f,
-    220.f,  1.f,
-    1.5708f, 0.3491f,
-    1u);
-
-  if (probe) {
+  void * probe =
+    sonar_wgpu_create(1u, 1u, 4u, 1500.f, 2950.f, 10.f, 0.f, 220.f, 1.f, 1.5708f, 0.3491f, 1u);
+
+  if (probe)
+  {
     sonar_wgpu_destroy(probe);
     gpuAvailable_ = true;
     std::cout << "[sonar_wgpu] GPU backend ready (wgpu/Vulkan).\n";
-  } else {
+  }
+  else
+  {
     gpuAvailable_ = false;
     std::cerr << "[sonar_wgpu] GPU init failed — CPU fallback active.\n";
   }
@@ -184,18 +183,19 @@ bool WgpuComputeBackend::Initialize(const SonarComputeInput &)
 #endif
 }
 
-
 // WgpuComputeBackend — Compute
-bool WgpuComputeBackend::Compute(
-  const SonarComputeInput & input,
-  SonarComputeOutput & output)
+bool WgpuComputeBackend::Compute(const SonarComputeInput & input, SonarComputeOutput & output)
 {
   if (!initialized_)
+  {
     return false;
+  }
 
-  //  CPU fallback path 
-  if (!gpuAvailable_) {
-    if (!cpuFallback_) {
+  //  CPU fallback path
+  if (!gpuAvailable_)
+  {
+    if (!cpuFallback_)
+    {
       cpuFallback_ = std::make_unique<CpuComputeBackend>();
       cpuFallback_->Initialize(input);
     }
@@ -206,80 +206,93 @@ bool WgpuComputeBackend::Compute(
   return false;
 #else
 
-  //  Validate inputs 
+  //  Validate inputs
   if (!input.depthImage || !input.normalImage || !input.reflectivityImage)
+  {
     return false;
+  }
 
   const int nb = input.nBeams;
   const int nr = input.nRays;
   const int nf = input.nFreq;
   if (nb <= 0 || nr <= 0 || nf <= 0)
+  {
     return false;
+  }
 
   auto t0 = std::chrono::high_resolution_clock::now();
 
-  //  Flatten OpenCV layout [row=ray, col=beam] → [beam * nr + ray] 
-  std::vector<float> depthFlat (static_cast<size_t>(nb) * nr, 0.f);
-  std::vector<float> reflFlat  (static_cast<size_t>(nb) * nr, 0.f);
+  //  Flatten OpenCV layout [row=ray, col=beam] → [beam * nr + ray]
+  std::vector<float> depthFlat(static_cast<size_t>(nb) * nr, 0.f);
+  std::vector<float> reflFlat(static_cast<size_t>(nb) * nr, 0.f);
   std::vector<float> normalFlat(static_cast<size_t>(nb) * nr * 3, 0.f);
 
-  for (int b = 0; b < nb; ++b) {
-    for (int r = 0; r < nr; ++r) {
+  for (int b = 0; b < nb; ++b)
+  {
+    for (int r = 0; r < nr; ++r)
+    {
       if (r >= input.depthImage->rows || b >= input.depthImage->cols)
+      {
         continue;
-      const size_t idx    = static_cast<size_t>(b) * nr + r;
-      depthFlat[idx]      = input.depthImage->at<float>(r, b);
-      reflFlat [idx]      = input.reflectivityImage->at<float>(r, b);
-      const cv::Vec3f n   = input.normalImage->at<cv::Vec3f>(r, b);
-      normalFlat[idx*3+0] = n[0];
-      normalFlat[idx*3+1] = n[1];
-      normalFlat[idx*3+2] = n[2];
+      }
+      const size_t idx = static_cast<size_t>(b) * nr + r;
+      depthFlat[idx] = input.depthImage->at<float>(r, b);
+      reflFlat[idx] = input.reflectivityImage->at<float>(r, b);
+      const cv::Vec3f n = input.normalImage->at<cv::Vec3f>(r, b);
+      normalFlat[idx * 3 + 0] = n[0];
+      normalFlat[idx * 3 + 1] = n[1];
+      normalFlat[idx * 3 + 2] = n[2];
     }
   }
 
-  //  Flatten beam corrector (identity if not provided) 
+  //  Flatten beam corrector (identity if not provided)
   std::vector<float> beamCorrFlat(static_cast<size_t>(nb) * nb, 0.f);
-  if (input.beamCorrector) {
+  if (input.beamCorrector)
+  {
     for (int r = 0; r < nb; ++r)
+    {
       for (int c = 0; c < nb; ++c)
+      {
         beamCorrFlat[r * nb + c] = input.beamCorrector[r][c];
-  } else {
+      }
+    }
+  }
+  else
+  {
     // Identity fallback: no cross-beam mixing
     for (int i = 0; i < nb; ++i)
+    {
       beamCorrFlat[i * nb + i] = 1.f;
+    }
   }
 
-  //  Persistent engine: recreate only when dimensions change 
+  //  Persistent engine: recreate only when dimensions change
   // engine_ is a class member (not static local) — safe for multiple sensors
-  if (!engine_ || engNb_ != nb || engNr_ != nr || engNf_ != nf) {
-    if (engine_) {
+  if (!engine_ || engNb_ != nb || engNr_ != nr || engNf_ != nf)
+  {
+    if (engine_)
+    {
       sonar_wgpu_destroy(engine_);
       engine_ = nullptr;
     }
 
     engine_ = sonar_wgpu_create(
-      static_cast<uint32_t>(nb),
-      static_cast<uint32_t>(nr),
-      static_cast<uint32_t>(nf),
-      static_cast<float>(input.soundSpeed),
-      static_cast<float>(input.bandwidth),
-      static_cast<float>(input.maxDistance),
-      static_cast<float>(input.attenuation),
-      static_cast<float>(input.sourceLevel),
-      input.sensorGain,
-      input.hFOV,
-      input.vFOV,
-      input.seed);
+      static_cast<uint32_t>(nb), static_cast<uint32_t>(nr), static_cast<uint32_t>(nf),
+      static_cast<float>(input.soundSpeed), static_cast<float>(input.bandwidth),
+      static_cast<float>(input.maxDistance), static_cast<float>(input.attenuation),
+      static_cast<float>(input.sourceLevel), input.sensorGain, input.hFOV, input.vFOV, input.seed);
 
     engNb_ = nb;
     engNr_ = nr;
     engNf_ = nf;
 
-    if (!engine_) {
+    if (!engine_)
+    {
       std::cerr << "[sonar_wgpu] sonar_wgpu_create failed "
                    "— switching to CPU fallback.\n";
       gpuAvailable_ = false;
-      if (!cpuFallback_) {
+      if (!cpuFallback_)
+      {
         cpuFallback_ = std::make_unique<CpuComputeBackend>();
         cpuFallback_->Initialize(input);
       }
@@ -287,39 +300,35 @@ bool WgpuComputeBackend::Compute(
     }
   }
 
-  //  GPU dispatch 
+  //  GPU dispatch
   float * result = sonar_wgpu_compute(
-    engine_,
-    depthFlat.data(),
-    normalFlat.data(),
-    reflFlat.data(),
-    beamCorrFlat.data(),
-    static_cast<uint32_t>(nb),
-    static_cast<uint32_t>(nr),
-    static_cast<uint32_t>(nf),
+    engine_, depthFlat.data(), normalFlat.data(), reflFlat.data(), beamCorrFlat.data(),
+    static_cast<uint32_t>(nb), static_cast<uint32_t>(nr), static_cast<uint32_t>(nf),
     input.frameIndex,
-    input.beamCorrectorSum > 0.f
-      ? input.beamCorrectorSum
-      : static_cast<float>(nb));
+    input.beamCorrectorSum > 0.f ? input.beamCorrectorSum : static_cast<float>(nb));
 
-  if (!result) {
+  if (!result)
+  {
     // Log failure explicitly — silent fallback makes debugging painful
     std::cerr << "[sonar_wgpu] sonar_wgpu_compute returned null "
                  "— CPU fallback for this frame.\n";
-    if (!cpuFallback_) {
+    if (!cpuFallback_)
+    {
       cpuFallback_ = std::make_unique<CpuComputeBackend>();
       cpuFallback_->Initialize(input);
     }
     return cpuFallback_->Compute(input, output);
   }
 
-  //  Unpack interleaved [re, im] result 
+  //  Unpack interleaved [re, im] result
   output.nBeams = nb;
-  output.nFreq  = nf;
+  output.nFreq = nf;
   output.beamSpectrum.resize(static_cast<size_t>(nb) * nf);
 
-  for (int b = 0; b < nb; ++b) {
-    for (int f = 0; f < nf; ++f) {
+  for (int b = 0; b < nb; ++b)
+  {
+    for (int f = 0; f < nf; ++f)
+    {
       const size_t i = (static_cast<size_t>(b) * nf + f) * 2;
       // explicit constructor — avoids initializer-list ambiguity with templates
       output.At(b, f) = std::complex<float>(result[i], result[i + 1]);
@@ -329,8 +338,8 @@ bool WgpuComputeBackend::Compute(
   sonar_wgpu_free(result, static_cast<size_t>(nb) * nf * 2);
 
   auto t1 = std::chrono::high_resolution_clock::now();
-  output.computeMicros = static_cast<double>(
-    std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
+  output.computeMicros =
+    static_cast<double>(std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
 
   return true;
 #endif  // HAVE_WGPU_BACKEND
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/include/sonar_wgpu.h b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/include/sonar_wgpu.h
index 61e72186..f86bdf0b 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/include/sonar_wgpu.h
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/include/sonar_wgpu.h
@@ -7,67 +7,42 @@
 #include <stdlib.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-/* Opaque handle to Rust-side engine */
-typedef struct SonarPhysicsEngine SonarPhysicsEngine;
+  /* Opaque handle to Rust-side engine */
+  typedef struct SonarPhysicsEngine SonarPhysicsEngine;
 
+  /* Create GPU sonar engine.
+     Returns NULL if GPU init fails (caller should fallback to CPU). */
+  SonarPhysicsEngine * sonar_wgpu_create(
+    uint32_t n_beams, uint32_t n_rays, uint32_t n_freq, float sound_speed, float bandwidth,
+    float max_range, float attenuation, float source_level, float sensor_gain, float h_fov,
+    float v_fov, uint64_t seed);
 
-/* Create GPU sonar engine.
-   Returns NULL if GPU init fails (caller should fallback to CPU). */
-SonarPhysicsEngine *sonar_wgpu_create(uint32_t n_beams,
-                                      uint32_t n_rays,
-                                      uint32_t n_freq,
-                                      float sound_speed,
-                                      float bandwidth,
-                                      float max_range,
-                                      float attenuation,
-                                      float source_level,
-                                      float sensor_gain,
-                                      float h_fov,
-                                      float v_fov,
-                                      uint64_t seed);
+  /* Run one frame.
+     Output layout: [re0, im0, re1, im1, ...] (beam-major).
+     Size = n_beams * n_freq * 2.
+     Caller owns memory → free with sonar_wgpu_free(). */
+  float * sonar_wgpu_compute(
+    SonarPhysicsEngine * engine, const float * depth, const float * normals, const float * refl,
+    const float * beam_corr, uint32_t n_beams, uint32_t n_rays, uint32_t n_freq, uint64_t frame,
+    float beam_corr_sum);
 
+  /* Free buffer returned by compute */
+  void sonar_wgpu_free(float * ptr, uintptr_t len);
 
-/* Run one frame.
-   Output layout: [re0, im0, re1, im1, ...] (beam-major).
-   Size = n_beams * n_freq * 2.
-   Caller owns memory → free with sonar_wgpu_free(). */
-float *sonar_wgpu_compute(SonarPhysicsEngine *engine,
-                          const float *depth,
-                          const float *normals,
-                          const float *refl,
-                          const float *beam_corr,
-                          uint32_t n_beams,
-                          uint32_t n_rays,
-                          uint32_t n_freq,
-                          uint64_t frame,
-                          float beam_corr_sum);
+  /* Destroy engine + release GPU resources */
+  void sonar_wgpu_destroy(void * engine);
 
+  /* --- CPU / legacy path (kept for compatibility) --- */
 
-/* Free buffer returned by compute */
-void sonar_wgpu_free(float *ptr, uintptr_t len);
-
-
-/* Destroy engine + release GPU resources */
-void sonar_wgpu_destroy(void *engine);
-
-
-/* --- CPU / legacy path (kept for compatibility) --- */
-
-SonarPhysicsEngine *sonar_physics_create(uint32_t n_beams,
-                                         uint32_t n_rays,
-                                         uint32_t n_freq,
-                                         float sound_speed,
-                                         float bandwidth,
-                                         float max_range,
-                                         float attenuation,
-                                         float h_fov,
-                                         float v_fov);
-
-void sonar_physics_destroy(SonarPhysicsEngine *engine);
+  SonarPhysicsEngine * sonar_physics_create(
+    uint32_t n_beams, uint32_t n_rays, uint32_t n_freq, float sound_speed, float bandwidth,
+    float max_range, float attenuation, float h_fov, float v_fov);
 
+  void sonar_physics_destroy(SonarPhysicsEngine * engine);
 
 #ifdef __cplusplus
 }
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/fft.wgsl b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/fft.wgsl
index 48bf9a20..ba86574a 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/fft.wgsl
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/shaders/fft.wgsl
@@ -47,7 +47,7 @@ fn main(
     let n = params.n_freq;
     let base = beam * n;
 
-    //  Step 1: load + bit-reversal 
+    //  Step 1: load + bit-reversal
     // FFT expects bit-reversed ordering before butterfly stages
     // each thread loads multiple elements (stride = workgroup size)
     for (var i = lid.x; i < n; i += 256u) {
@@ -59,7 +59,7 @@ fn main(
     }
     workgroupBarrier();
 
-    //  Step 2: butterfly stages 
+    //  Step 2: butterfly stages
     // log2(n) stages, each doubling the merge size
     for (var s = 0u; s < params.log2_n; s++) {
 
@@ -101,7 +101,7 @@ fn main(
         workgroupBarrier();
     }
 
-    //  Step 3: write back 
+    //  Step 3: write back
     // output is time-domain signal per beam
     // range mapping is handled outside (t → r conversion)
     for (var i = lid.x; i < n; i += 256u) {
diff --git a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/physics_engine.rs b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/physics_engine.rs
index db965ccf..c33904b6 100644
--- a/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/physics_engine.rs
+++ b/gazebo/dave_gz_multibeam_sonar/multibeam_sonar/wgpu_backend/src/physics_engine.rs
@@ -227,7 +227,7 @@ impl SonarPhysicsEngine {
 
         let readback_buf = mk(spectrum, CD | MR);
 
-        //  uniforms 
+        //  uniforms
         let scatter_params = BackscatterParams {
             n_beams: config.n_beams,
             n_rays: config.n_rays,
@@ -296,7 +296,7 @@ impl SonarPhysicsEngine {
             source: wgpu::ShaderSource::Wgsl(include_str!("../shaders/fft.wgsl").into()),
         });
 
-        //  pipelines 
+        //  pipelines
         let scatter_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
             label: None,
             entries: &[