diff --git a/coretrace/CMakeLists.txt b/coretrace/CMakeLists.txt
index bc237a00..85e6c02c 100644
--- a/coretrace/CMakeLists.txt
+++ b/coretrace/CMakeLists.txt
@@ -58,7 +58,6 @@ set(CORETRACE_SRC
 	treemesh.cpp
 	types.cpp
 	vshot.cpp
-
 	simdata_bridge.cpp
 )
 
diff --git a/coretrace/simdriver/main.cpp b/coretrace/simdriver/main.cpp
index 2cb9fff3..e99d4e32 100644
--- a/coretrace/simdriver/main.cpp
+++ b/coretrace/simdriver/main.cpp
@@ -2,21 +2,26 @@
  * @file main.cpp
  * @brief Command-line driver for SolTrace ray tracing.
  *
- * Reads a JSON file to configure SimulationData, runs the ray tracer,
+ * Reads a JSON or .stinput file to configure SimulationData, runs the ray tracer,
  * and writes the ray interaction records to a CSV file.
  *
  * Usage:
- *   simdriver <input.json> <output.csv> [options]
+ *   simdriver <input.json|input.stinput> [<output.csv>] [options]
  *
  * Options:
  *   --threads <n>   Number of parallel threads (default: 1)
  *   --rays <n>      Override the number of rays from the JSON file
+ *   --no-output     Skip result retrieval and CSV output (output file not
+ *                   required when this flag is set)
+ *   --no-csv        Retrieve results but skip writing the CSV file (output
+ *                   file argument not required when this flag is set)
  *   --embree        Use the Embree runner (only available if built with
  *                   SOLTRACE_BUILD_EMBREE_SUPPORT=ON; falls back to native
  *                   runner with a warning if Embree support is absent)
  *   --optix         Use the OptiX runner (only available if built with
  *                   SOLTRACE_BUILD_OPTIX_SUPPORT=ON; falls back to native
  *                   runner with a warning if OptiX support is absent)
+ *   --verbose       Enable verbose logging in the OptiX runner
  */
 
 #include <cstdlib>
@@ -47,11 +52,15 @@ using SolTrace::Runner::RunnerStatus;
 static void print_usage(const char *prog)
 {
     std::cerr
-        << "Usage: " << prog << " <input.json> <output.csv> [options]\n"
+        << "Usage: " << prog << " <input.json|input.stinput> [<output.csv>] [options]\n"
         << "\n"
         << "Options:\n"
         << "  --threads <n>   Number of threads (default: 1)\n"
         << "  --rays <n>      Override number of rays specified in the JSON file\n"
+        << "  --no-output     Skip result retrieval and CSV output\n"
+        << "                  (output file argument not required with this flag)\n"
+        << "  --no-csv        Retrieve results but skip writing the CSV file\n"
+        << "                  (output file argument not required with this flag)\n"
 #ifdef SOLTRACE_EMBREE_SUPPORT
         << "  --embree        Use Embree runner instead of the native runner\n"
         << "                  (requires SOLTRACE_BUILD_EMBREE_SUPPORT=ON at build time)\n"
@@ -60,26 +69,54 @@ static void print_usage(const char *prog)
         << "  --optix         Use OptiX runner instead of the native runner\n"
         << "                  (requires SOLTRACE_BUILD_OPTIX_SUPPORT=ON at build time)\n"
 #endif
+        << "  --verbose       Enable verbose logging in the OptiX runner\n"
         ;
 }
 
 int main(int argc, char *argv[])
 {
-    if (argc < 3)
+    if (argc < 2)
     {
         print_usage(argv[0]);
         return EXIT_FAILURE;
     }
 
+    // Pre-scan for --no-output and --no-csv so we know whether output_file is required
+    bool skip_output = false;
+    bool skip_csv = false;
+    for (int i = 2; i < argc; ++i)
+    {
+        const std::string a = argv[i];
+        if (a == "--no-output") skip_output = true;
+        else if (a == "--no-csv") skip_csv = true;
+    }
+
+    const bool file_optional = skip_output || skip_csv;
+
     const std::string input_file = argv[1];
-    const std::string output_file = argv[2];
+
+    // argv[2], if present and not a flag (does not start with --), is treated as
+    // the output file path.  This allows the user to supply an output path even
+    // when --no-output or --no-csv is also present without it being mis-parsed
+    // as an unknown option.
+    const bool has_output_arg = (argc >= 3) && (std::string(argv[2]).rfind("--", 0) != 0);
+    const std::string output_file = has_output_arg ? argv[2] : "";
+    const int opts_start = has_output_arg ? 3 : 2;
+
+    if (!file_optional && !has_output_arg)
+    {
+        std::cerr << "Error: output file is required unless --no-output or --no-csv is specified\n";
+        print_usage(argv[0]);
+        return EXIT_FAILURE;
+    }
 
     int num_threads = 1;
     long long num_rays_override = -1; // -1 means use what the JSON specifies
     bool use_embree = false;
     bool use_optix = false;
+    bool verbose = false;
 
-    for (int i = 3; i < argc; ++i)
+    for (int i = opts_start; i < argc; ++i)
     {
         const std::string arg = argv[i];
         if (arg == "--threads")
@@ -126,6 +163,10 @@ int main(int argc, char *argv[])
                 return EXIT_FAILURE;
             }
         }
+        else if (arg == "--no-output" || arg == "--no-csv")
+        {
+            // already handled in pre-scan; skip here
+        }
         else if (arg == "--embree")
         {
             use_embree = true;
@@ -134,6 +175,10 @@ int main(int argc, char *argv[])
         {
             use_optix = true;
         }
+        else if (arg == "--verbose")
+        {
+            verbose = true;
+        }
         else
         {
             std::cerr << "Error: unknown option '" << arg << "'\n";
@@ -143,23 +188,55 @@ int main(int argc, char *argv[])
     }
 
     // -------------------------------------------------------------------------
-    // Load simulation data from JSON
+    // Load simulation data from JSON or .stinput file
     // -------------------------------------------------------------------------
     SimulationData simData;
-    try
     {
+        // Determine format by extension
+        auto ends_with = [](const std::string &s, const std::string &suffix)
+        {
+            return s.size() >= suffix.size() &&
+                   s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
+        };
+        const bool is_stinput = ends_with(input_file, ".stinput");
+        const bool is_json = ends_with(input_file, ".json");
+
+        if (!is_stinput && !is_json)
+        {
+            std::cerr << "Error: unrecognised input file extension (expected .json or .stinput): "
+                      << input_file << "\n";
+            return EXIT_FAILURE;
+        }
+
         std::cout << "Loading simulation data from: " << input_file << "...\n";
         auto t_load_start = std::chrono::steady_clock::now();
-        simData.import_json_file(input_file);
+
+        if (is_json)
+        {
+            try
+            {
+                simData.import_json_file(input_file);
+            }
+            catch (const std::exception &e)
+            {
+                std::cerr << "Error loading JSON file: " << e.what() << "\n";
+                return EXIT_FAILURE;
+            }
+        }
+        else // .stinput
+        {
+            if (!simData.import_from_file(input_file))
+            {
+                std::cerr << "Error loading .stinput file: " << input_file << "\n";
+                return EXIT_FAILURE;
+            }
+        }
+
         auto t_load_end = std::chrono::steady_clock::now();
         std::cout << "  Loaded in "
                   << std::chrono::duration<double>(t_load_end - t_load_start).count()
-                  << " s\n";
-    }
-    catch (const std::exception &e)
-    {
-        std::cerr << "Error loading JSON file: " << e.what() << "\n";
-        return EXIT_FAILURE;
+                  << " s\n"
+                  << "  Elements loaded: " << simData.get_number_of_elements() << "\n";
     }
 
     // Override ray count if the user requested it
@@ -217,24 +294,33 @@ int main(int argc, char *argv[])
         std::cout << "  Completed in "
                   << std::chrono::duration<double>(t_run_end - t_run_start).count()
                   << " s\n";
+        std::cout << "  Rays launched: " << runner.get_number_rays_launched() << "\n";
+        std::cout << "  Rays traced:   " << runner.get_number_rays_traced() << "\n";
 
-        std::cout << "Retrieving results...\n";
-        auto t_report_start = std::chrono::steady_clock::now();
-        sts = runner.report_simulation(&result, 0);
-        auto t_report_end = std::chrono::steady_clock::now();
-        if (sts != RunnerStatus::SUCCESS)
+        if (!skip_output)
         {
-            std::cerr << "Error: failed to collect simulation results\n";
-            return EXIT_FAILURE;
+            std::cout << "Retrieving results...\n";
+            auto t_report_start = std::chrono::steady_clock::now();
+            sts = runner.report_simulation(&result, 0);
+            auto t_report_end = std::chrono::steady_clock::now();
+            if (sts != RunnerStatus::SUCCESS)
+            {
+                std::cerr << "Error: failed to collect simulation results\n";
+                return EXIT_FAILURE;
+            }
+            std::cout << "  Retrieved in "
+                      << std::chrono::duration<double>(t_report_end - t_report_start).count()
+                      << " s\n";
+        }
+        else
+        {
+            std::cout << "Skipping result retrieval (--no-output).\n";
         }
-        std::cout << "  Retrieved in "
-                  << std::chrono::duration<double>(t_report_end - t_report_start).count()
-                  << " s\n";
     }
     else
 #endif
 #ifdef SOLTRACE_OPTIX_SUPPORT
-    if (use_optix)
+        if (use_optix)
     {
         OptixRunner runner;
 
@@ -245,6 +331,8 @@ int main(int argc, char *argv[])
             return EXIT_FAILURE;
         }
 
+        runner.set_verbose(verbose);
+
         std::cout << "Using OptiX runner\n";
 
         std::cout << "Setting up simulation...\n";
@@ -272,19 +360,33 @@ int main(int argc, char *argv[])
         std::cout << "  Completed in "
                   << std::chrono::duration<double>(t_run_end - t_run_start).count()
                   << " s\n";
+        std::cout << "  Rays launched: " << runner.get_number_rays_launched() << "\n";
+        std::cout << "  Rays traced:   " << runner.get_number_rays_traced() << "\n";
 
-        std::cout << "Retrieving results...\n";
-        auto t_report_start = std::chrono::steady_clock::now();
-        sts = runner.report_simulation(&result, 0);
-        auto t_report_end = std::chrono::steady_clock::now();
-        if (sts != RunnerStatus::SUCCESS)
+        if (!skip_output)
         {
-            std::cerr << "Error: failed to collect simulation results\n";
-            return EXIT_FAILURE;
+            std::cout << "Retrieving results...\n";
+            auto t_report_start = std::chrono::steady_clock::now();
+            sts = runner.report_simulation(&result, 0);
+            auto t_report_end = std::chrono::steady_clock::now();
+            if (sts != RunnerStatus::SUCCESS)
+            {
+                std::cerr << "Error: failed to collect simulation results\n";
+                return EXIT_FAILURE;
+            }
+            std::cout << "  Retrieved in "
+                      << std::chrono::duration<double>(t_report_end - t_report_start).count()
+                      << " s\n";
+        }
+        else
+        {
+            std::cout << "Skipping result retrieval (--no-output).\n";
+        }
+
+        if (!verbose)
+        {
+            runner.print_timing();
         }
-        std::cout << "  Retrieved in "
-                  << std::chrono::duration<double>(t_report_end - t_report_start).count()
-                  << " s\n";
     }
     else
 #endif
@@ -341,39 +443,59 @@ int main(int argc, char *argv[])
         std::cout << "  Completed in "
                   << std::chrono::duration<double>(t_run_end - t_run_start).count()
                   << " s\n";
+        std::cout << "  Rays launched: " << runner.get_number_rays_launched() << "\n";
+        std::cout << "  Rays traced:   " << runner.get_number_rays_traced() << "\n";
 
-        std::cout << "Retrieving results...\n";
-        auto t_report_start = std::chrono::steady_clock::now();
-        sts = runner.report_simulation(&result, 0);
-        auto t_report_end = std::chrono::steady_clock::now();
-        if (sts != RunnerStatus::SUCCESS)
+        if (!skip_output)
         {
-            std::cerr << "Error: failed to collect simulation results\n";
-            return EXIT_FAILURE;
+            std::cout << "Retrieving results...\n";
+            auto t_report_start = std::chrono::steady_clock::now();
+            sts = runner.report_simulation(&result, 0);
+            auto t_report_end = std::chrono::steady_clock::now();
+            if (sts != RunnerStatus::SUCCESS)
+            {
+                std::cerr << "Error: failed to collect simulation results\n";
+                return EXIT_FAILURE;
+            }
+            std::cout << "  Retrieved in "
+                      << std::chrono::duration<double>(t_report_end - t_report_start).count()
+                      << " s\n";
+        }
+        else
+        {
+            std::cout << "Skipping result retrieval (--no-output).\n";
         }
-        std::cout << "  Retrieved in "
-                  << std::chrono::duration<double>(t_report_end - t_report_start).count()
-                  << " s\n";
     }
 
     // -------------------------------------------------------------------------
     // Write results to CSV
     // -------------------------------------------------------------------------
-    std::cout << "Writing " << result.get_number_of_records()
-              << " ray records to: " << output_file << "...\n";
-    try
+    if (!skip_output && !skip_csv)
     {
-        auto t_write_start = std::chrono::steady_clock::now();
-        result.write_csv_file(output_file);
-        auto t_write_end = std::chrono::steady_clock::now();
-        std::cout << "  Written in "
-                  << std::chrono::duration<double>(t_write_end - t_write_start).count()
-                  << " s\n";
+        std::cout << "Writing " << result.get_number_of_records()
+                  << " ray records to: " << output_file << "...\n";
+        try
+        {
+            auto t_write_start = std::chrono::steady_clock::now();
+            result.write_csv_file(output_file);
+            auto t_write_end = std::chrono::steady_clock::now();
+            std::cout << "  Written in "
+                      << std::chrono::duration<double>(t_write_end - t_write_start).count()
+                      << " s\n";
+        }
+        catch (const std::exception &e)
+        {
+            std::cerr << "Error writing CSV file: " << e.what() << "\n";
+            return EXIT_FAILURE;
+        }
     }
-    catch (const std::exception &e)
+    else if (skip_csv)
     {
-        std::cerr << "Error writing CSV file: " << e.what() << "\n";
-        return EXIT_FAILURE;
+        std::cout << "Skipping CSV output (--no-csv).\n";
+    }
+    else
+    {
+        std::cout << "Skipping CSV output (--no-output).\n";
     }
 
     std::cout << "Done.\n";
diff --git a/coretrace/simulation_data/simulation_parameters.hpp b/coretrace/simulation_data/simulation_parameters.hpp
index be15daa9..6fb66717 100644
--- a/coretrace/simulation_data/simulation_parameters.hpp
+++ b/coretrace/simulation_data/simulation_parameters.hpp
@@ -21,9 +21,6 @@ struct SimulationParameters
     // TODO: Figure out how to store time...
     DateTime sim_dt;
 
-    bool include_sun_shape_errors;
-    bool include_optical_errors;
-
     std::uint_fast64_t number_of_rays;
     std::uint_fast64_t max_number_of_rays;
     double tolerance;
@@ -33,6 +30,9 @@ struct SimulationParameters
 
     int seed;
 
+    bool include_sun_shape_errors;
+    bool include_optical_errors;
+
     SimulationParameters() : number_of_rays(10000),
                              max_number_of_rays(1000000),
                              tolerance(0.0),
diff --git a/coretrace/simulation_runner/embree_runner/embree_runner.cpp b/coretrace/simulation_runner/embree_runner/embree_runner.cpp
index 52484840..5f66d9e7 100644
--- a/coretrace/simulation_runner/embree_runner/embree_runner.cpp
+++ b/coretrace/simulation_runner/embree_runner/embree_runner.cpp
@@ -50,8 +50,7 @@ namespace SolTrace::EmbreeRunner
     {
         // TODO: Do a more efficient implementation of this?
         this->clean_embree();
-        NativeRunner::update_simulation(data);
-        return RunnerStatus::SUCCESS;
+        return NativeRunner::update_simulation(data);
     }
 
     RunnerStatus EmbreeRunner::run_simulation()
diff --git a/coretrace/simulation_runner/embree_runner/ftz_daz.hpp b/coretrace/simulation_runner/embree_runner/ftz_daz.hpp
new file mode 100644
index 00000000..742d4b17
--- /dev/null
+++ b/coretrace/simulation_runner/embree_runner/ftz_daz.hpp
@@ -0,0 +1,39 @@
+#ifndef SOLTRACE_FTZ_DAZ_HPP
+#define SOLTRACE_FTZ_DAZ_HPP
+
+#include <cstdint>
+
+// Set Flush-to-Zero (FTZ) and Denormals-are-Zero (DAZ) floating-point flags
+// for the calling thread. These are thread-local CPU register settings that
+// avoid slow denormal handling in the FPU, as recommended by the Embree docs.
+// On ARM, DAZ is implicit when FZ is set (no separate bit).
+
+#if defined(__SSE__) || defined(_M_X64) || defined(_M_IX86)
+#  include <xmmintrin.h>
+// Set FTZ (bit 15, 0x8000) and DAZ (bit 6, 0x0040) via MXCSR.
+// <xmmintrin.h> is sufficient; <pmmintrin.h> (SSE3) is intentionally avoided
+// so this compiles on SSE-only targets (-msse without -msse3).
+#  define SOLTRACE_SET_FTZ_DAZ() \
+     _mm_setcsr(_mm_getcsr() | 0x8040u)
+#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
+#  if defined(_MSC_VER)
+#    include <intrin.h>
+#    define SOLTRACE_SET_FTZ_DAZ() do { \
+       uint64_t _fpcr = _ReadStatusReg(ARM64_FPCR); \
+       _fpcr |= (1ULL << 24); \
+       _WriteStatusReg(ARM64_FPCR, _fpcr); \
+     } while(0)
+#  else
+     /* GCC / Clang on ARM64 */
+#    define SOLTRACE_SET_FTZ_DAZ() do { \
+       uint64_t _fpcr; \
+       __asm__ __volatile__("mrs %0, fpcr" : "=r"(_fpcr)); \
+       _fpcr |= (1ULL << 24); \
+       __asm__ __volatile__("msr fpcr, %0" : : "r"(_fpcr)); \
+     } while(0)
+#  endif
+#else
+#  define SOLTRACE_SET_FTZ_DAZ() /* unsupported architecture, no-op */
+#endif
+
+#endif // SOLTRACE_FTZ_DAZ_HPP
diff --git a/coretrace/simulation_runner/embree_runner/trace_embree.cpp b/coretrace/simulation_runner/embree_runner/trace_embree.cpp
index 0e36f03a..d3720dfe 100644
--- a/coretrace/simulation_runner/embree_runner/trace_embree.cpp
+++ b/coretrace/simulation_runner/embree_runner/trace_embree.cpp
@@ -1,7 +1,9 @@
 #include "trace_embree.hpp"
 
 #include <algorithm>
+#include <chrono>
 #include <cmath>
+#include <fstream>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -27,6 +29,7 @@
 
 #include "embree_helper.hpp"
 #include "find_element_hit_embree.hpp"
+#include "ftz_daz.hpp"
 
 namespace SolTrace::EmbreeRunner
 {
@@ -56,11 +59,16 @@ namespace SolTrace::EmbreeRunner
         // RTCScene embree_scene = nullptr;
         // bool use_shared_embree = false;
 
-        // Make device
-        // std::cout << "Making embree device..." << std::endl;
-        std::stringstream ss;
-        ss << "threads=" << nthreads;
-        embree_device = rtcNewDevice(ss.str().c_str());
+        // // Make device
+        // // std::cout << "Making embree device..." << std::endl;
+        // std::stringstream ss;
+        // ss << "threads=" << nthreads;
+        // embree_device = rtcNewDevice(ss.str().c_str());
+        
+        // TODO: Need to test this on largest scenes we expect to
+        // trace. Adding more threads does not significantly help
+        // when there are only ~6000 elements.
+        embree_device = rtcNewDevice("threads=1");
 
         // std::cout << "Setting error function..." << std::endl;
         rtcSetDeviceErrorFunction(embree_device, error_function, NULL);
@@ -98,9 +106,16 @@ namespace SolTrace::EmbreeRunner
         const RTCScene &embree_scene)
     {
 
+        // using Clock = std::chrono::steady_clock;
+        // using Seconds = std::chrono::duration<double>;
+
+        // auto t_start = Clock::now();
+
         System->RayData.SetUp(nthreads, NumberOfRays);
         System->SunRayCount = 0;
 
+        // auto t_after_setup = Clock::now();
+
         // Initialize Sun
         glm::dvec3 PosSunStage;
         bool status = SolTrace::NativeRunner::SunToPrimaryStage(logger,
@@ -112,6 +127,8 @@ namespace SolTrace::EmbreeRunner
         if (!status)
             return RunnerStatus::ERROR;
 
+        // auto t_after_sun_init = Clock::now();
+
         uint_fast64_t rem = NumberOfRays % nthreads;
         uint_fast64_t nrays_per_thread = NumberOfRays / nthreads;
         uint_fast64_t nrays;
@@ -138,7 +155,27 @@ namespace SolTrace::EmbreeRunner
             manager->manage(k, std::move(my_future));
         }
 
-        return manager->monitor_until_completion();
+        // auto t_after_launch = Clock::now();
+
+        RunnerStatus result = manager->monitor_until_completion();
+
+        // auto t_done = Clock::now();
+
+        // double s_setup     = Seconds(t_after_setup    - t_start).count();
+        // double s_sun_init  = Seconds(t_after_sun_init - t_after_setup).count();
+        // double s_launch    = Seconds(t_after_launch   - t_after_sun_init).count();
+        // double s_ray_trace = Seconds(t_done           - t_after_launch).count();
+        // double s_total     = Seconds(t_done           - t_start).count();
+
+        // std::cout << "[trace_embree] timing (nthreads=" << nthreads
+        //           << ", rays=" << NumberOfRays << ")\n"
+        //           << "  ray_data_setup : " << s_setup     << " s\n"
+        //           << "  sun_init       : " << s_sun_init  << " s\n"
+        //           << "  thread_launch  : " << s_launch    << " s\n"
+        //           << "  ray_trace      : " << s_ray_trace << " s\n"
+        //           << "  total          : " << s_total     << " s\n";
+
+        return result;
     }
 
     RunnerStatus trace_embree_single_thread(
@@ -157,6 +194,10 @@ namespace SolTrace::EmbreeRunner
     {
         // std::cout << "Thread " << thread_id << " with seed " << seed
         //           << std::endl;
+        // Set flush-to-zero and denormals-are-zero for this thread to avoid
+        // slow denormal handling in the FPU (recommended by Embree docs).
+        SOLTRACE_SET_FTZ_DAZ();
+
         // Initialize Internal State Variables
         MTRand myrng(seed);
 
@@ -180,6 +221,72 @@ namespace SolTrace::EmbreeRunner
         uint_fast64_t n_rays_active = NumberOfRays;
         uint_fast64_t sun_ray_count_local = 0;
 
+        // // Timing accumulators
+        // using Clock = std::chrono::steady_clock;
+        // using ns_t = long long;
+        // ns_t t_generate_ray = 0;
+        // ns_t t_transform_to_local = 0;
+        // ns_t t_find_element_hit = 0;
+        // ns_t t_determine_interaction = 0;
+        // ns_t t_process_interaction = 0;
+        // ns_t t_transform_to_reference = 0;
+        // ns_t t_ray_data_append = 0;
+        // ns_t t_progress_update = 0;
+        // uint_fast64_t n_find_element_hit = 0;
+        // uint_fast64_t n_determine_interaction = 0;
+        // uint_fast64_t n_process_interaction = 0;
+        // uint_fast64_t n_ray_data_append = 0;
+
+        // auto write_timing = [&]() {
+        //     std::string fname = "trace_embree_timing_thread_" +
+        //                         std::to_string(thread_id) + ".csv";
+        //     std::ofstream f(fname);
+        //     constexpr double ns_to_s = 1.0e-9;
+        //     ns_t t_total = t_generate_ray + t_transform_to_local + t_find_element_hit +
+        //                    t_determine_interaction + t_process_interaction +
+        //                    t_transform_to_reference + t_ray_data_append + t_progress_update;
+        //     auto pct = [&](ns_t t) -> double {
+        //         return t_total > 0 ? 100.0 * static_cast<double>(t) / static_cast<double>(t_total) : 0.0;
+        //     };
+        //     f << std::fixed;
+        //     f << "section,calls,seconds,pct_total\n";
+        //     f << "generate_ray,,"         << t_generate_ray        * ns_to_s << "," << pct(t_generate_ray)        << "\n";
+        //     f << "transform_to_local,,"   << t_transform_to_local  * ns_to_s << "," << pct(t_transform_to_local)  << "\n";
+        //     f << "find_element_hit,"      << n_find_element_hit    << "," << t_find_element_hit      * ns_to_s << "," << pct(t_find_element_hit)      << "\n";
+        //     f << "determine_interaction," << n_determine_interaction << "," << t_determine_interaction * ns_to_s << "," << pct(t_determine_interaction) << "\n";
+        //     f << "process_interaction,"   << n_process_interaction  << "," << t_process_interaction   * ns_to_s << "," << pct(t_process_interaction)   << "\n";
+        //     f << "transform_to_reference,,"<< t_transform_to_reference * ns_to_s << "," << pct(t_transform_to_reference) << "\n";
+        //     f << "ray_data_append,"       << n_ray_data_append     << "," << t_ray_data_append        * ns_to_s << "," << pct(t_ray_data_append)       << "\n";
+        //     f << "progress_update,,"      << t_progress_update     * ns_to_s << "," << pct(t_progress_update)     << "\n";
+        //     f << "total,,"                << t_total               * ns_to_s << ",100.0\n";
+        // };
+
+        // auto write_timing = [&]() {
+        //     // std::string fname = "trace_embree_timing_thread_" +
+        //     //                     std::to_string(thread_id) + ".csv";
+        //     // std::ofstream f(fname);
+        //     std::stringstream f;
+        //     constexpr double ns_to_s = 1.0e-9;
+        //     ns_t t_total = t_generate_ray + t_transform_to_local + t_find_element_hit +
+        //                    t_determine_interaction + t_process_interaction +
+        //                    t_transform_to_reference + t_ray_data_append + t_progress_update;
+        //     auto pct = [&](ns_t t) -> double {
+        //         return t_total > 0 ? 100.0 * static_cast<double>(t) / static_cast<double>(t_total) : 0.0;
+        //     };
+        //     f << "thread_id " << thread_id << "\n" << std::fixed;
+        //     f << "section,calls,seconds,pct_total\n";
+        //     f << "generate_ray,,"         << t_generate_ray        * ns_to_s << "," << pct(t_generate_ray)        << "\n";
+        //     f << "transform_to_local,,"   << t_transform_to_local  * ns_to_s << "," << pct(t_transform_to_local)  << "\n";
+        //     f << "find_element_hit,"      << n_find_element_hit    << "," << t_find_element_hit      * ns_to_s << "," << pct(t_find_element_hit)      << "\n";
+        //     f << "determine_interaction," << n_determine_interaction << "," << t_determine_interaction * ns_to_s << "," << pct(t_determine_interaction) << "\n";
+        //     f << "process_interaction,"   << n_process_interaction  << "," << t_process_interaction   * ns_to_s << "," << pct(t_process_interaction)   << "\n";
+        //     f << "transform_to_reference,,"<< t_transform_to_reference * ns_to_s << "," << pct(t_transform_to_reference) << "\n";
+        //     f << "ray_data_append,"       << n_ray_data_append     << "," << t_ray_data_append        * ns_to_s << "," << pct(t_ray_data_append)       << "\n";
+        //     f << "progress_update,,"      << t_progress_update     * ns_to_s << "," << pct(t_progress_update)     << "\n";
+        //     f << "total,,"                << t_total               * ns_to_s << ",100.0\n";
+        //     std::cout << f.str() << std::endl;
+        // };
+
         // Loop through stages
         for (uint_fast64_t i = 0; i < System->StageList.size(); i++)
         {
@@ -242,9 +349,14 @@ namespace SolTrace::EmbreeRunner
                 }
 
                 // transform the global incoming ray to local stage coordinates
-                TransformToLocal(PosRayGlob, CosRayGlob,
-                                 Stage->Origin, Stage->RRefToLoc,
-                                 PosRayStage, CosRayStage);
+                {
+                    // auto _t0 = Clock::now();
+                    TransformToLocal(PosRayGlob, CosRayGlob,
+                                     Stage->Origin, Stage->RRefToLoc,
+                                     PosRayStage, CosRayStage);
+                    // t_transform_to_local += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                    //     Clock::now() - _t0).count();
+                }
 
                 // Initialize internal variables for ray intersection tracing
                 bool RayInStage = true;
@@ -270,12 +382,18 @@ namespace SolTrace::EmbreeRunner
                 while (RayInStage)
                 {
 
-                    FindElementHit_embree(embree_scene, i, RayNumber,
-                                          PosRayGlob, CosRayGlob,
-                                          LastPosRaySurfElement, LastCosRaySurfElement,
-                                          LastDFXYZ, LastElementNumber, LastRayNumber,
-                                          LastPosRaySurfStage, LastCosRaySurfStage,
-                                          ErrorFlag, LastHitBackSide, StageHit);
+                    {
+                        // auto _t0 = Clock::now();
+                        FindElementHit_embree(embree_scene, i, RayNumber,
+                                              PosRayGlob, CosRayGlob,
+                                              LastPosRaySurfElement, LastCosRaySurfElement,
+                                              LastDFXYZ, LastElementNumber, LastRayNumber,
+                                              LastPosRaySurfStage, LastCosRaySurfStage,
+                                              ErrorFlag, LastHitBackSide, StageHit);
+                        // t_find_element_hit += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        //     Clock::now() - _t0).count();
+                        // ++n_find_element_hit;
+                    }
 
                     // Breakout if ray left stage
                     if (!StageHit)
@@ -290,6 +408,7 @@ namespace SolTrace::EmbreeRunner
                     if (i == 0 && MultipleHitCount == 1)
                     {
                         // Add ray to Stage RayData
+                        // auto _t0_append = Clock::now();
                         auto r = System->RayData.Append(thread_id,
                                                         PosRayGlob,
                                                         CosRayGlob,
@@ -297,6 +416,9 @@ namespace SolTrace::EmbreeRunner
                                                         i + 1,
                                                         LastRayNumber,
                                                         RayEvent::CREATE);
+                        // t_ray_data_append += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        //     Clock::now() - _t0_append).count();
+                        // ++n_ray_data_append;
 
                         if (r == nullptr)
                         {
@@ -324,8 +446,10 @@ namespace SolTrace::EmbreeRunner
                         else
                             optics = &optelm->Optics.Front;
 
-                        bool good =
-                            SolTrace::NativeRunner::determine_interaction_type(
+                        bool good;
+                        {
+                            // auto _t0 = Clock::now();
+                            good = SolTrace::NativeRunner::determine_interaction_type(
                                 logger,
                                 i,
                                 thread_id,
@@ -334,9 +458,14 @@ namespace SolTrace::EmbreeRunner
                                 LastDFXYZ,
                                 LastCosRaySurfElement,
                                 rev);
+                            // t_determine_interaction += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                            //     Clock::now() - _t0).count();
+                            // ++n_determine_interaction;
+                        }
 
                         if (!good)
                         {
+                            // write_timing();
                             return RunnerStatus::ERROR;
                         }
 
@@ -349,43 +478,60 @@ namespace SolTrace::EmbreeRunner
 
                     // Process Interaction
                     int k = LastElementNumber - 1;
-                    SolTrace::NativeRunner::ProcessInteraction(
-                        System,
-                        myrng,
-                        IncludeSunShape,
-                        optics,
-                        IncludeErrors,
-                        i,
-                        Stage,
-                        MultipleHitCount,
-                        LastDFXYZ,
-                        LastCosRaySurfElement,
-                        ErrorFlag,
-                        CosRayOutElement,
-                        LastPosRaySurfElement,
-                        PosRayOutElement);
+                    {
+                        // auto _t0 = Clock::now();
+                        SolTrace::NativeRunner::ProcessInteraction(
+                            System,
+                            myrng,
+                            IncludeSunShape,
+                            optics,
+                            IncludeErrors,
+                            i,
+                            Stage,
+                            MultipleHitCount,
+                            LastDFXYZ,
+                            LastCosRaySurfElement,
+                            ErrorFlag,
+                            CosRayOutElement,
+                            LastPosRaySurfElement,
+                            PosRayOutElement);
+                        // t_process_interaction += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        //     Clock::now() - _t0).count();
+                        // ++n_process_interaction;
+                    }
 
                     // Transform ray back to stage coordinate system
-                    TransformToReference(PosRayOutElement,
-                                         CosRayOutElement,
-                                         Stage->ElementList[k]->Origin,
-                                         Stage->ElementList[k]->RLocToRef,
-                                         PosRayStage,
-                                         CosRayStage);
-                    TransformToReference(PosRayStage,
-                                         CosRayStage,
-                                         Stage->Origin,
-                                         Stage->RLocToRef,
-                                         PosRayGlob,
-                                         CosRayGlob);
-
-                    System->RayData.Append(thread_id,
-                                           PosRayGlob,
-                                           CosRayGlob,
-                                           LastElementNumber,
-                                           i + 1,
-                                           LastRayNumber,
-                                           rev);
+                    {
+                        // auto _t0 = Clock::now();
+                        TransformToReference(PosRayOutElement,
+                                             CosRayOutElement,
+                                             Stage->ElementList[k]->Origin,
+                                             Stage->ElementList[k]->RLocToRef,
+                                             PosRayStage,
+                                             CosRayStage);
+                        TransformToReference(PosRayStage,
+                                             CosRayStage,
+                                             Stage->Origin,
+                                             Stage->RLocToRef,
+                                             PosRayGlob,
+                                             CosRayGlob);
+                        // t_transform_to_reference += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        //     Clock::now() - _t0).count();
+                    }
+
+                    {
+                        // auto _t0 = Clock::now();
+                        System->RayData.Append(thread_id,
+                                               PosRayGlob,
+                                               CosRayGlob,
+                                               LastElementNumber,
+                                               i + 1,
+                                               LastRayNumber,
+                                               rev);
+                        // t_ray_data_append += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        //     Clock::now() - _t0).count();
+                        // ++n_ray_data_append;
+                    }
 
                     // Break out if multiple hits are not allowed
                     if (!Stage->MultiHitsPerRay)
@@ -404,29 +550,47 @@ namespace SolTrace::EmbreeRunner
 
                 if (update_count % update_rate == 0)
                 {
+                    // auto _t0 = Clock::now();
                     double progress = update_count / total_work;
                     manager->progress_update(thread_id, progress);
-                    if (manager->terminate(thread_id))
+                    bool should_cancel = manager->terminate(thread_id);
+                    // t_progress_update += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                    //     Clock::now() - _t0).count();
+                    if (should_cancel)
+                    {
+                        // write_timing();
                         return RunnerStatus::CANCEL;
+                    }
                 }
 
                 // Handle if Ray was absorbed
                 if (RayIsAbsorbed)
                 {
-                    TransformToReference(LastPosRaySurfStage,
-                                         LastCosRaySurfStage,
-                                         Stage->Origin,
-                                         Stage->RLocToRef,
-                                         PosRayGlob,
-                                         CosRayGlob);
-
-                    System->RayData.Append(thread_id,
-                                           PosRayGlob,
-                                           CosRayGlob,
-                                           LastElementNumber,
-                                           i + 1,
-                                           LastRayNumber,
-                                           RayEvent::ABSORB);
+                    {
+                        // auto _t0 = Clock::now();
+                        TransformToReference(LastPosRaySurfStage,
+                                             LastCosRaySurfStage,
+                                             Stage->Origin,
+                                             Stage->RLocToRef,
+                                             PosRayGlob,
+                                             CosRayGlob);
+                        // t_transform_to_reference += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        //     Clock::now() - _t0).count();
+                    }
+
+                    {
+                        // auto _t0 = Clock::now();
+                        System->RayData.Append(thread_id,
+                                               PosRayGlob,
+                                               CosRayGlob,
+                                               LastElementNumber,
+                                               i + 1,
+                                               LastRayNumber,
+                                               RayEvent::ABSORB);
+                        // t_ray_data_append += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        //     Clock::now() - _t0).count();
+                        // ++n_ray_data_append;
+                    }
 
                     n_rays_active--;
 
@@ -530,13 +694,19 @@ namespace SolTrace::EmbreeRunner
                     {
                         LastRayNumber = RayNumber;
 
-                        System->RayData.Append(thread_id,
-                                               PosRayGlob,
-                                               CosRayGlob,
-                                               ELEMENT_NULL,
-                                               i + 1,
-                                               LastRayNumber,
-                                               RayEvent::EXIT);
+                        {
+                            // auto _t0 = Clock::now();
+                            System->RayData.Append(thread_id,
+                                                   PosRayGlob,
+                                                   CosRayGlob,
+                                                   ELEMENT_NULL,
+                                                   i + 1,
+                                                   LastRayNumber,
+                                                   RayEvent::EXIT);
+                            // t_ray_data_append += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                            //     Clock::now() - _t0).count();
+                            // ++n_ray_data_append;
+                        }
 
                         n_rays_active--;
 
@@ -586,6 +756,7 @@ namespace SolTrace::EmbreeRunner
                     // size_t pp = IncomingRays[PreviousStageDataArrayIndex - 1].Num;
                     // System->errlog("LastRayNumberInPreviousStage=0, stage %d, PrevIdx=%d, CurIdx=%d, pp=%d", i + 1,
                     //                PreviousStageDataArrayIndex, StageDataArrayIndex, pp);
+                    // write_timing();
                     return RunnerStatus::ERROR;
                 }
             }
@@ -593,6 +764,7 @@ namespace SolTrace::EmbreeRunner
             {
                 // System->errlog("Invalid PreviousStageDataArrayIndex: %u, @ stage %d",
                 //                PreviousStageDataArrayIndex, i + 1);
+                // write_timing();
                 return RunnerStatus::ERROR;
             }
         }
@@ -603,13 +775,19 @@ namespace SolTrace::EmbreeRunner
         for (uint_fast64_t k = 0; k < n_rays_active; ++k)
         {
             GlobalRay_refactored ray = IncomingRays[k];
-            System->RayData.Append(thread_id,
-                                   ray.Pos,
-                                   ray.Cos,
-                                   ELEMENT_NULL,
-                                   idx + 1,
-                                   ray.Num,
-                                   RayEvent::EXIT);
+            {
+                // auto _t0 = Clock::now();
+                System->RayData.Append(thread_id,
+                                       ray.Pos,
+                                       ray.Cos,
+                                       ELEMENT_NULL,
+                                       idx + 1,
+                                       ray.Num,
+                                       RayEvent::EXIT);
+                // t_ray_data_append += std::chrono::duration_cast<std::chrono::nanoseconds>(
+                //     Clock::now() - _t0).count();
+                // ++n_ray_data_append;
+            }
         }
 
         // System->SunRayCount is atomic so this is thread safe
diff --git a/coretrace/simulation_runner/native_runner/native_runner.cpp b/coretrace/simulation_runner/native_runner/native_runner.cpp
index dba8e915..ca6341aa 100644
--- a/coretrace/simulation_runner/native_runner/native_runner.cpp
+++ b/coretrace/simulation_runner/native_runner/native_runner.cpp
@@ -285,8 +285,8 @@ namespace SolTrace::NativeRunner
     {
         // TODO: Do a more efficient implementation of this?
         this->tsys.ClearAll();
-        this->setup_simulation(data);
-        return RunnerStatus::SUCCESS;
+        return this->setup_simulation(data);
+        // return RunnerStatus::SUCCESS;
     }
 
     RunnerStatus NativeRunner::run_simulation()
@@ -327,7 +327,7 @@ namespace SolTrace::NativeRunner
 
         const TSystem *sys = this->get_system();
         // const TRayData ray_data = sys->AllRayData;
-        const TRayData ray_data = sys->RayData;
+        const TRayData& ray_data = sys->RayData;
         std::map<unsigned int, SolTrace::Result::ray_record_ptr> ray_records;
         std::map<unsigned int, SolTrace::Result::ray_record_ptr>::iterator iter;
         uint_fast64_t ndata = ray_data.Count();
diff --git a/coretrace/simulation_runner/native_runner/native_runner.hpp b/coretrace/simulation_runner/native_runner/native_runner.hpp
index 800b4d2e..4e8387f7 100644
--- a/coretrace/simulation_runner/native_runner/native_runner.hpp
+++ b/coretrace/simulation_runner/native_runner/native_runner.hpp
@@ -39,6 +39,18 @@ namespace SolTrace::NativeRunner
         virtual RunnerStatus report_simulation(SolTrace::Result::SimulationResult *result,
                                                int level_spec) override;
 
+        virtual uint_fast64_t get_number_rays_launched() const override
+        {
+            return tsys.SunRayCount;
+        }
+        virtual uint_fast64_t get_number_rays_traced() const override
+        {
+            // TODO: This could be wrong if we hit max number of rays before getting this many hits.
+            // At the moment max number of rays is ignored though...
+            // return tsys.sim_raycount;
+            return tsys.SunRayCount > 0 ? tsys.sim_raycount : 0;
+        }
+
         // Runner options
         void disable_power_tower() { this->as_power_tower = false; }
         void enable_power_tower() { this->as_power_tower = true; }
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp
index f2f136af..e1559210 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp
@@ -1,4 +1,5 @@
 #include "Aperture.h"
+#include "constants.hpp"
 
 using namespace OptixCSP;
 
@@ -28,12 +29,24 @@ double ApertureCircle::get_radius() const {
     return radius; 
 }
 
-double ApertureCircle::get_width() const { 
-    return 2.0 * radius; 
+// double ApertureCircle::get_width() const { 
+//     return 2.0 * radius; 
+// }
+
+// double ApertureCircle::get_height() const { 
+//     return 2.0 * radius; 
+// }
+
+// ApertureHexagon implementations
+ApertureHexagon::ApertureHexagon() : side_length(1.0) {}
+ApertureHexagon::ApertureHexagon(double s) : side_length(s) {}
+
+ApertureType ApertureHexagon::get_aperture_type() const {
+    return ApertureType::HEXAGON;
 }
 
-double ApertureCircle::get_height() const { 
-    return 2.0 * radius; 
+double ApertureHexagon::get_side_length() const {
+    return side_length;
 }
 
 // ApertureRectangleEasy implementations
@@ -77,3 +90,13 @@ Vec3d ApertureQuadrilateral::get_p0() const { return m_p0; }
 Vec3d ApertureQuadrilateral::get_p1() const { return m_p1; }
 Vec3d ApertureQuadrilateral::get_p2() const { return m_p2; }
 Vec3d ApertureQuadrilateral::get_p3() const { return m_p3; }
+
+ApertureAnnulus::ApertureAnnulus() : ri(0.5), ro(1.0), arc(2.0 * SolTrace::Data::PI) {}
+ApertureAnnulus::ApertureAnnulus(double r_inner, double r_outer, double arc)
+  : ri(r_inner), ro(r_outer), arc(arc) {}
+ApertureType ApertureAnnulus::get_aperture_type() const {
+    return ApertureType::ANNULUS;
+}
+double ApertureAnnulus::get_radius_inner() const { return ri; }
+double ApertureAnnulus::get_radius_outer() const { return ro; }
+double ApertureAnnulus::get_arc() const { return arc; }
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h
index 4f6dd9ba..83228d5d 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h
@@ -41,13 +41,27 @@ namespace OptixCSP {
         virtual ApertureType get_aperture_type() const override;
         void set_size(double r);
         virtual double get_radius() const override;
-        virtual double get_width() const override;
-        virtual double get_height() const override;
+        // virtual double get_width() const override;
+        // virtual double get_height() const override;
 
     private:
         double radius;
     };
 
+    // Concrete class for a circular aperture.
+    class ApertureHexagon : public Aperture {
+    public:
+        ApertureHexagon();
+        ApertureHexagon(double s);
+        virtual ~ApertureHexagon() = default;
+
+        virtual ApertureType get_aperture_type() const override;
+        virtual double get_side_length() const;
+
+    private:
+        double side_length;
+    };
+
     // Concrete class for an easy rectangular aperture.
     class ApertureRectangle : public Aperture {
     public:
@@ -114,4 +128,19 @@ namespace OptixCSP {
         Vec3d m_p3;
     };
 
-}
\ No newline at end of file
+    class ApertureAnnulus : public Aperture {
+    public:
+        ApertureAnnulus();
+      ApertureAnnulus(double ri, double ro, double arc);
+        virtual ~ApertureAnnulus() = default;
+        virtual ApertureType get_aperture_type() const override;
+        double get_radius_inner() const;
+        double get_radius_outer() const;
+      double get_arc() const;
+    private:
+        double ri;
+        double ro;
+      double arc; // Measured in radians
+    };
+
+}
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp
index cfbec7fe..7ade3585 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp
@@ -1,6 +1,5 @@
 #include <cstdint>
 #include <limits>
-#include <string>
 #include <vector>
 
 #include "vec3d.h"
@@ -22,6 +21,7 @@ CspElementBase::CspElementBase()
 CspElement::CspElement()
 {
     m_origin = Vec3d(0.0, 0.0, 0.0);
+    m_aim_point = Vec3d(0.0, 0.0, 1.0);
     m_rotation_matrix = Matrix33d();
     m_surface = nullptr;
     m_aperture = nullptr;
@@ -42,6 +42,16 @@ void CspElement::set_origin(const Vec3d &o)
     m_origin = o;
 }
 
+const Vec3d &CspElement::get_aim_point() const
+{
+    return m_aim_point;
+}
+
+void CspElement::set_aim_point(const Vec3d &ap)
+{
+    m_aim_point = ap;
+}
+
 std::shared_ptr<Aperture> CspElement::get_aperture() const
 {
     return m_aperture;
@@ -91,7 +101,7 @@ Matrix33d CspElement::get_rotation_matrix() const
     return m_rotation_matrix;
 }
 
-void CspElement::set_rotation_matrix(const Matrix33d& rotation_matrix)
+void CspElement::set_rotation_matrix(const Matrix33d &rotation_matrix)
 {
     m_rotation_matrix = rotation_matrix;
 }
@@ -218,14 +228,14 @@ GeometryDataST CspElement::toDeviceGeometryData() const
             Vec3d edge_y = v2 * (float)height;
 
             Vec3d local_anchor(x_coord + width, y_coord, 0.0);
-            //float3 anchor = OptixCSP::toFloat3(m_origin - v1 * 0.5 - v2 * 0.5);
+            // float3 anchor = OptixCSP::toFloat3(m_origin - v1 * 0.5 - v2 * 0.5);
             Vec3d global_anchor = rotation_matrix * local_anchor + m_origin;
 
-            GeometryDataST::Rectangle_Parabolic heliostat(OptixCSP::toFloat3(edge_x), 
-                OptixCSP::toFloat3(edge_y), 
-                OptixCSP::toFloat3(global_anchor),
-                (float)m_surface->get_curvature_1(),
-                (float)m_surface->get_curvature_2());
+            GeometryDataST::Rectangle_Parabolic heliostat(OptixCSP::toFloat3(edge_x),
+                                                          OptixCSP::toFloat3(edge_y),
+                                                          OptixCSP::toFloat3(global_anchor),
+                                                          (float)m_surface->get_curvature_1(),
+                                                          (float)m_surface->get_curvature_2());
             geometry_data.setRectangleParabolic(heliostat);
         }
 
@@ -236,13 +246,10 @@ GeometryDataST CspElement::toDeviceGeometryData() const
 
             float3 center = OptixCSP::toFloat3(m_origin);
             Matrix33d rotation_matrix = get_rotation_matrix(); // L2G rotation matrix
-
             float3 base_x = OptixCSP::toFloat3(rotation_matrix.get_x_basis());
-
             float3 base_z = OptixCSP::toFloat3(rotation_matrix.get_z_basis());
 
             GeometryDataST::Cylinder_Y heliostat(center, radius, half_height, base_x, base_z);
-
             geometry_data.setCylinder_Y(heliostat);
         }
     }
@@ -263,7 +270,9 @@ GeometryDataST CspElement::toDeviceGeometryData() const
         Vec3d v2_global = rotation_matrix * v2 + m_origin;
         Vec3d v3_global = rotation_matrix * v3 + m_origin;
 
-        GeometryDataST::Triangle_Flat heliostat(OptixCSP::toFloat3(v1_global), OptixCSP::toFloat3(v2_global), OptixCSP::toFloat3(v3_global));
+        GeometryDataST::Triangle_Flat heliostat(OptixCSP::toFloat3(v1_global),
+                                                OptixCSP::toFloat3(v2_global),
+                                                OptixCSP::toFloat3(v3_global));
         geometry_data.setTriangle_Flat(heliostat);
     }
 
@@ -292,6 +301,47 @@ GeometryDataST CspElement::toDeviceGeometryData() const
         geometry_data.setQuadrilateral_Flat(heliostat);
     }
 
+    if (aperture_type == ApertureType::CIRCLE)
+    {
+        ApertureCircle circ = static_cast<ApertureCircle &>(*m_aperture);
+        float r = circ.get_radius();
+        float3 o = OptixCSP::toFloat3(m_origin);
+        float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin));
+        if (surface_type == SurfaceType::FLAT)
+        {
+            GeometryDataST::Circle_Flat heliostat(o, n, r);
+            geometry_data.setCircle_Flat(heliostat);
+        }
+    }
+
+    if (aperture_type == ApertureType::HEXAGON)
+    {
+        ApertureHexagon hex = static_cast<ApertureHexagon &>(*m_aperture);
+        float s = hex.get_side_length();
+        float3 o = OptixCSP::toFloat3(m_origin);
+        float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin));
+        if (surface_type == SurfaceType::FLAT)
+        {
+            GeometryDataST::Hexagon_Flat hex(o, n, s);
+            geometry_data.setHexagon_Flat(hex);
+        }
+    }
+
+    if (aperture_type == ApertureType::ANNULUS)
+    {
+        ApertureAnnulus anf = static_cast<ApertureAnnulus &>(*m_aperture);
+        float radius_in = anf.get_radius_inner();
+        float radius_out = anf.get_radius_outer();
+        float arc = anf.get_arc();
+        float3 o = OptixCSP::toFloat3(m_origin);
+        float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin));
+        if (surface_type == SurfaceType::FLAT)
+        {
+            GeometryDataST::Annulus_Flat anf(o, n, radius_in, radius_out, arc);
+            geometry_data.setAnnulus_Flat(anf);
+        }
+    }
+
     geometry_data.id = this->m_id;
 
     return geometry_data;
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.h
index 78a0572e..0a89c0cc 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.h
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.h
@@ -26,6 +26,9 @@ namespace OptixCSP
         // Positioning and orientation.
         virtual const Vec3d &get_origin() const = 0;
         virtual void set_origin(const OptixCSP::Vec3d &) = 0;
+      virtual const Vec3d &get_aim_point() const = 0;
+      virtual void set_aim_point(const Vec3d &o) = 0;
+
         // virtual const Vec3d& get_euler_angles() const = 0;
         // virtual void set_euler_angles(const Vec3d&) = 0;
 
@@ -55,6 +58,9 @@ namespace OptixCSP
         const Vec3d &get_origin() const override;
         void set_origin(const Vec3d &o) override;
 
+      const Vec3d &get_aim_point() const override;
+      void set_aim_point(const Vec3d &o) override;
+
         std::shared_ptr<Aperture> get_aperture() const;
         std::shared_ptr<Surface> get_surface() const;
         ApertureType get_aperture_type() const;
@@ -106,6 +112,7 @@ namespace OptixCSP
                         const OpticalDistribution od);
 
         Vec3d m_origin;
+        Vec3d m_aim_point;
         Matrix33d m_rotation_matrix;
 
         Vec3d m_upper_box_bound; // Global coordinates
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp
index 9baf1cfe..9ac1344b 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp
@@ -1,10 +1,7 @@
 #include "data_manager.h"
 #include "soltrace_system.h"
 #include "utils/util_check.hpp"
-#include <fstream>
-#include <sstream>
 #include <stdexcept>
-#include <chrono>
 #include <iostream>
 #include <iomanip>
 #include "sun_utils.h"
@@ -20,7 +17,8 @@ dataManager::dataManager()
 	  sun_user_intensity_D(nullptr),
 	  sun_user_capacity(0),
 	  rng_states_D(nullptr),
-	  rng_states_capacity(0)
+	  rng_states_capacity(0),
+	  depth_exceeded_count_D(nullptr)
 {
 
 	// Initialize launch parameters with default values
@@ -29,11 +27,12 @@ dataManager::dataManager()
 	launch_params_H.max_depth = 5;
 	launch_params_H.ray_offset = 0;
 
-	launch_params_H.hit_point_buffer = nullptr;
+	// launch_params_H.hit_point_buffer = nullptr;
+	launch_params_H.hit_buffer = nullptr;
 	launch_params_H.sun_dir_buffer = nullptr;
 	launch_params_H.rng_states = nullptr;
-	launch_params_H.element_id_buffer = nullptr;
-	launch_params_H.hit_type_buffer = nullptr;
+	// launch_params_H.element_id_buffer = nullptr;
+	// launch_params_H.hit_type_buffer = nullptr;
 	launch_params_H.sun_vector = make_float3(0.0f, 0.0f, 10.0f);
 	launch_params_H.sun_shape = OptixCSP::SunShape::UNKNOWN;
 	launch_params_H.include_sun_shape_errors = false;
@@ -58,6 +57,8 @@ dataManager::dataManager()
 	launch_params_H.material_data_array_back = nullptr;
 	launch_params_H.sun_dir_seed = 0ULL;
 
+	launch_params_H.d_depth_exceeded_count = nullptr;
+
 	launch_params_H.geometry_data_array = nullptr;
 	launch_params_H.handle = OptixTraversableHandle{};
 
@@ -73,6 +74,11 @@ OptixCSP::LaunchParams *dataManager::getDeviceLaunchParams() const { return laun
 
 void dataManager::allocateLaunchParams()
 {
+	if (launch_params_D)
+	{
+		CUDA_CHECK(cudaFree(launch_params_D));
+		launch_params_D = nullptr;
+	}
 	CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&launch_params_D), sizeof(LaunchParams)));
 }
 
@@ -109,8 +115,23 @@ void dataManager::ensureCurandStates(
 	launch_params_H.rng_states = rng_states_D;
 }
 
+void dataManager::ensureDepthExceededCounter()
+{
+	if (depth_exceeded_count_D == nullptr)
+	{
+		CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&depth_exceeded_count_D), sizeof(uint64_t)));
+		CUDA_CHECK(cudaMemset(depth_exceeded_count_D, 0, sizeof(uint64_t)));
+		launch_params_H.d_depth_exceeded_count = depth_exceeded_count_D;
+	}
+}
+
 void dataManager::allocateGeometryDataArray(std::vector<GeometryDataST> geometry_data_array_H)
 {
+	if (geometry_data_array_D)
+	{
+		CUDA_CHECK(cudaFree(geometry_data_array_D));
+		geometry_data_array_D = nullptr;
+	}
 
 	CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&geometry_data_array_D),
 						  geometry_data_array_H.size() * sizeof(GeometryDataST)));
@@ -138,6 +159,16 @@ void dataManager::updateGeometryDataArray(std::vector<GeometryDataST> geometry_d
 void dataManager::allocateMaterialDataArray(std::vector<MaterialData> material_data_array_front_H,
 											std::vector<MaterialData> material_data_array_back_H)
 {
+	if (material_data_array_front_D)
+	{
+		CUDA_CHECK(cudaFree(material_data_array_front_D));
+		material_data_array_front_D = nullptr;
+	}
+	if (material_data_array_back_D)
+	{
+		CUDA_CHECK(cudaFree(material_data_array_back_D));
+		material_data_array_back_D = nullptr;
+	}
 
 	CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&material_data_array_front_D),
 						  material_data_array_front_H.size() * sizeof(MaterialData)));
@@ -260,5 +291,11 @@ void dataManager::cleanup() {
 	}
 	launch_params_H.rng_states = nullptr;
 
+	if (depth_exceeded_count_D != nullptr) {
+		CUDA_CHECK(cudaFree(depth_exceeded_count_D));
+		depth_exceeded_count_D = nullptr;
+	}
+	launch_params_H.d_depth_exceeded_count = nullptr;
+
 	launch_params_H.handle = OptixTraversableHandle{};
 }
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.h
index 7922d132..9d59219e 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.h
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.h
@@ -29,6 +29,9 @@ namespace OptixCSP
         curandState *rng_states_D;
         size_t rng_states_capacity;
 
+        // Device counter: rays terminated by max depth (not absorption)
+        uint64_t *depth_exceeded_count_D;
+
         float *sun_user_angle_D;
         float *sun_user_intensity_D;
         size_t sun_user_capacity;
@@ -68,5 +71,8 @@ namespace OptixCSP
                                 unsigned long long seed,
                                 unsigned int sequence_offset,
                                 cudaStream_t stream);
+
+        /// Allocates the depth-exceeded counter on the device if not already allocated.
+        void ensureDepthExceededCounter();
     };
 }
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp
index 8369ecb0..118f69b9 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp
@@ -3,7 +3,6 @@
 #include "sun_utils.h"
 #include "soltrace_state.h"
 #include "utils/util_check.hpp"
-#include "data_manager.h"
 
 #include <optix_stubs.h>
 
@@ -41,9 +40,56 @@ void GeometryManager::collect_geometry_info(const std::vector<std::shared_ptr<Cs
         OptixAabb aabb;
         uint32_t sbt_offset = 0;
 
-        // if (element->get_aperture_type() == ApertureType::CIRCLE)
-        // {
-        // }
+        if (element->get_aperture_type() == ApertureType::ANNULUS)
+        {
+            if (element->get_surface_type() == SurfaceType::FLAT)
+            {
+                sbt_offset = static_cast<uint32_t>(OpticalEntityType::ANNULUS_FLAT);
+            }
+            else
+            {
+                std::stringstream ss;
+                ss << "Unimplemented surface type ("
+                   << static_cast<int>(element->get_surface_type())
+                   << ") for annular aperture ("
+                   << static_cast<int>(element->get_aperture_type());
+                throw std::runtime_error(ss.str());
+            }
+        }
+
+        if (element->get_aperture_type() == ApertureType::CIRCLE)
+        {
+            if (element->get_surface_type() == SurfaceType::FLAT)
+            {
+                sbt_offset = static_cast<uint32_t>(OpticalEntityType::CIRCLE_FLAT);
+            }
+            else
+            {
+                std::stringstream ss;
+                ss << "Unimplemented surface type ("
+                   << static_cast<int>(element->get_surface_type())
+                   << ") for circular aperture ("
+                   << static_cast<int>(element->get_aperture_type());
+                throw std::runtime_error(ss.str());
+            }
+        }
+
+        if (element->get_aperture_type() == ApertureType::HEXAGON)
+        {
+            if (element->get_surface_type() == SurfaceType::FLAT)
+            {
+                sbt_offset = static_cast<uint32_t>(OpticalEntityType::HEXAGON_FLAT);
+            }
+            else
+            {
+                std::stringstream ss;
+                ss << "Unimplemented surface type ("
+                   << static_cast<int>(element->get_surface_type())
+                   << ") for hexagon aperture ("
+                   << static_cast<int>(element->get_aperture_type());
+                throw std::runtime_error(ss.str());
+            }
+        }
 
         if (element->get_aperture_type() == ApertureType::RECTANGLE)
         {
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp
index 9985b17c..ade2345d 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp
@@ -42,7 +42,10 @@ const std::map<OpticalEntityType, std::string> IntersectionKernelMap = {
     {OpticalEntityType::RECTANGLE_FLAT, "__intersection__rectangle_flat"},
     {OpticalEntityType::TRIANGLE_FLAT, "__intersection__triangle_flat"},
     {OpticalEntityType::CYLINDRICAL, "__intersection__cylinder_y"},
-    {OpticalEntityType::QUADRILATERAL_FLAT, "__intersection__quadrilateral_flat"}};
+    {OpticalEntityType::QUADRILATERAL_FLAT, "__intersection__quadrilateral_flat"},
+    {OpticalEntityType::CIRCLE_FLAT, "__intersection__circle_flat"},
+    {OpticalEntityType::HEXAGON_FLAT, "__intersection__hexagon_flat"},
+    {OpticalEntityType::ANNULUS_FLAT, "__intersection__annulus_flat"}};
 
 pipelineManager::pipelineManager(SoltraceState &state) : m_state(state) {}
 
@@ -115,6 +118,7 @@ void pipelineManager::loadModules()
     // Geometry module.
     {
         std::string ptx = loadPtxFromFile("intersection");
+        LOG_SIZE = sizeof(LOG);
         OPTIX_CHECK(optixModuleCreate(
             m_state.context,
             &moduleCompileOptions,
@@ -127,6 +131,7 @@ void pipelineManager::loadModules()
     // Shading/materials module.
     {
         std::string ptx = loadPtxFromFile("materials");
+        LOG_SIZE = sizeof(LOG);
         OPTIX_CHECK(optixModuleCreate(
             m_state.context,
             &moduleCompileOptions,
@@ -139,6 +144,7 @@ void pipelineManager::loadModules()
     // Sun module.
     {
         std::string ptx = loadPtxFromFile("sun");
+        LOG_SIZE = sizeof(LOG);
         OPTIX_CHECK(optixModuleCreate(
             m_state.context,
             &moduleCompileOptions,
@@ -169,10 +175,10 @@ void pipelineManager::createPipeline()
 
     // Link program groups to pipeline
     OptixPipelineLinkOptions pipeline_link_options = {};
-    // TODO max trace belong to who?
-    pipeline_link_options.maxTraceDepth = MAX_TRACE_DEPTH; // Maximum recursion depth for ray tracing.
+    pipeline_link_options.maxTraceDepth = m_max_trace_depth; // Maximum recursion depth for ray tracing.
 
     // Create the OptiX pipeline by linking the program groups.
+    LOG_SIZE = sizeof(LOG);
     OPTIX_CHECK(optixPipelineCreate(
         m_state.context,                                    // OptiX context.
         &m_state.pipeline_compile_options,                  // Compile options for the pipeline.
@@ -197,7 +203,7 @@ void pipelineManager::createPipeline()
     // Compute stack sizes based on the maximum trace depth and other settings.
     OPTIX_CHECK(optixUtilComputeStackSizes(
         &stack_sizes,                               // Input stack sizes.
-        MAX_TRACE_DEPTH,                            // Maximum trace depth.
+        m_max_trace_depth,                          // Maximum trace depth.
         0,                                          // maxCCDepth: Maximum depth of continuation callables (none in this case).
         0,                                          // maxDCDepth: Maximum depth of direct callables (none in this case).
         &direct_callable_stack_size_from_traversal, // Output: Stack size for callable traversal.
@@ -233,6 +239,7 @@ void pipelineManager::createHitGroupProgram(OptixProgramGroup &group,
     desc.hitgroup.moduleAH = nullptr;
     desc.hitgroup.entryFunctionNameAH = nullptr;
 
+    LOG_SIZE = sizeof(LOG);
     OPTIX_CHECK(optixProgramGroupCreate(
         m_state.context,
         &desc,
@@ -256,7 +263,12 @@ void pipelineManager::createSunProgram()
     desc.raygen.entryFunctionName = "__raygen__sun_source";
 
     // Create the program group
-    OPTIX_CHECK_LOG(optixProgramGroupCreate(
+    // Note: OPTIX_CHECK_LOG is not used here because the macro creates its own
+    // local LOG_ buffer (not the global LOG), causing it to always print 2048
+    // null bytes to stderr. Instead we use OPTIX_CHECK and manually print any
+    // non-empty log content when verbose mode is enabled.
+    LOG_SIZE = sizeof(LOG);
+    OPTIX_CHECK(optixProgramGroupCreate(
         m_state.context, // OptiX context.
         &desc,           // Descriptor defining the program group.
         1,               // Number of program groups to create (1 in this case).
@@ -264,6 +276,11 @@ void pipelineManager::createSunProgram()
         LOG, &LOG_SIZE,  // Logs to capture diagnostic information.
         &group           // Output: Handle for the created program group.
         ));
+    if (LOG_SIZE > 1 && LOG[0] != '\0')
+    {
+        std::cerr << "OptiX log for optixProgramGroupCreate (sun):\n"
+                  << std::string(LOG, LOG + LOG_SIZE) << std::endl;
+    }
 
     m_program_groups.push_back(group);
     m_state.raygen_prog_group = group;
@@ -333,6 +350,7 @@ void pipelineManager::createMissProgram()
     desc.miss.entryFunctionName = "__miss__ms";
 
     // Create the program grou
+    LOG_SIZE = sizeof(LOG);
     OPTIX_CHECK(optixProgramGroupCreate(
         m_state.context,
         &desc,
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.h
index 0d77791b..510b39f8 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.h
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.h
@@ -34,6 +34,8 @@ namespace OptixCSP
 
         void set_verbose(bool verbose) { m_verbose = verbose; }
 
+        void set_max_trace_depth(uint8_t depth) { m_max_trace_depth = depth; }
+
         /**
          * @brief Loads PTX code from a file.
          * @param kernelName The name of the PTX kernel file to load.
@@ -112,6 +114,7 @@ namespace OptixCSP
         std::vector<OptixProgramGroup> m_program_groups;                      ///< Stores all created OptiX program groups.
         std::map<OpticalEntityType, size_t> m_intersection_program_group_map; ///< Map surface-aperture combinations to index in m_program_groups
         bool m_verbose = false;
+        uint8_t m_max_trace_depth = DEFAULT_MAX_TRACE_DEPTH;
 
         // // Number of program groups categorized by type.
         // int num_raygen_programs = 1; ///< Number of ray generation programs.
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu
new file mode 100644
index 00000000..a685e2c2
--- /dev/null
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu
@@ -0,0 +1,276 @@
+#include "ray_utils.h"
+
+#include "shaders/Soltrace.h"
+#include "utils/util_check.hpp"
+
+#include <cuda_runtime.h>
+#include <cub/cub.cuh>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <chrono>
+#include <cstdint>
+#include <vector>
+
+namespace OptixCSP
+{
+
+    // ---------------------------------------------------------------------------
+    // Pass 1 – count output records per ray.
+    //
+    // For each ray the loop reads up to max_depth HitRecord entries.
+    // raw_count accumulates every valid record (HIT_CREATE through HIT_EXIT).
+    // A ray with only a HIT_CREATE event (raw_count == 1) contributes 0 output
+    // records (it missed all elements).  Any ray with raw_count > 1 contributes
+    // all raw_count records (CREATE + one or more hits).
+    // ---------------------------------------------------------------------------
+    __global__ static void count_ray_outputs(
+        const HitRecord *__restrict__ hit_buffer,
+        uint32_t num_rays,
+        uint32_t max_depth,
+        uint8_t  *__restrict__ out_record_count,
+        uint8_t  *__restrict__ out_has_hit)
+    {
+        const uint32_t ray = blockIdx.x * blockDim.x + threadIdx.x;
+        if (ray >= num_rays)
+            return;
+
+        uint32_t raw_count = 0;
+        for (uint32_t depth = 0; depth < max_depth; ++depth)
+        {
+            const uint8_t ht = hit_buffer[max_depth * ray + depth].hit_type;
+            if (ht < HIT_CREATE || ht > HIT_EXIT)
+                break;
+            ++raw_count;
+            if (ht == HIT_ABSORB || ht == HIT_EXIT)
+                break;
+        }
+
+        const uint8_t has_hit = (raw_count > 1) ? 1u : 0u;
+        out_record_count[ray] = has_hit ? static_cast<uint8_t>(raw_count) : 0u;
+        out_has_hit[ray] = has_hit;
+
+        return;
+    }
+
+    // ---------------------------------------------------------------------------
+    // Pass 2 – write compacted records.
+    //
+    // Each thread handles one ray.  Rays whose entry in the exclusive prefix-sum
+    // equals that of the next ray (i.e. record_count was 0) write nothing.
+    // For qualifying rays the CREATE record is written first, followed by all
+    // subsequent hit records, using the pre-computed offset as the base index.
+    // ---------------------------------------------------------------------------
+    __global__ static void compact_ray_outputs(
+        const HitRecord *__restrict__ hit_buffer,
+        uint32_t num_rays,
+        uint32_t max_depth,
+        const uint64_t *__restrict__ offsets,
+        const uint8_t  *__restrict__ has_hit,
+        HitRecord *__restrict__ out_buffer)
+    {
+        const uint32_t ray = blockIdx.x * blockDim.x + threadIdx.x;
+        if (ray >= num_rays || !has_hit[ray])
+            return;
+
+        const HitRecord *ray_base = hit_buffer + max_depth * ray;
+        uint64_t out_idx = offsets[ray];
+
+        // Depth 0 is always HIT_CREATE for qualifying rays
+        out_buffer[out_idx++] = ray_base[0];
+
+        for (uint32_t depth = 1; depth < max_depth; ++depth)
+        {
+            const HitRecord &hr = ray_base[depth];
+            const uint8_t ht = hr.hit_type;
+
+            if (ht < HIT_CREATE || ht > HIT_EXIT)
+                break;
+
+            out_buffer[out_idx++] = hr;
+            if (ht == HIT_ABSORB || ht == HIT_EXIT)
+                break;
+        }
+
+        return;
+    }
+
+    // ---------------------------------------------------------------------------
+    // Host-callable scratch management
+    // ---------------------------------------------------------------------------
+    void allocate_compaction_scratch(CompactionScratch &scratch, uint64_t num_rays, uint64_t max_depth)
+    {
+        free_compaction_scratch(scratch);
+
+        CUDA_CHECK(cudaMalloc(&scratch.d_count, num_rays * sizeof(uint8_t)));
+        CUDA_CHECK(cudaMalloc(&scratch.d_offsets, num_rays * sizeof(uint64_t)));
+        CUDA_CHECK(cudaMalloc(&scratch.d_has_hit, num_rays * sizeof(uint8_t)));
+        CUDA_CHECK(cudaMalloc(&scratch.d_n_hit, sizeof(uint32_t)));
+
+        // Query CUB temp-storage sizes using typed null pointers (size query only).
+        // scan_bytes must cover both ExclusiveSum and DeviceSelect::Flagged (d_scan_tmp is reused).
+        uint32_t *null_u32 = nullptr;
+        uint8_t  *null_u8  = nullptr;
+        uint64_t *null_u64 = nullptr;
+        cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, null_u8, null_u64, num_rays);
+        cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, null_u8, null_u32, num_rays);
+
+        size_t select_bytes = 0;
+        thrust::counting_iterator<uint64_t> count_iter(0ull);
+        cub::DeviceSelect::Flagged(nullptr, select_bytes, count_iter, null_u8, null_u64, null_u32, num_rays);
+        if (select_bytes > scratch.scan_bytes)
+            scratch.scan_bytes = select_bytes;
+
+        CUDA_CHECK(cudaMalloc(&scratch.d_scan_tmp, scratch.scan_bytes > 0 ? scratch.scan_bytes : 1));
+        CUDA_CHECK(cudaMalloc(&scratch.d_red_tmp, scratch.red_bytes > 0 ? scratch.red_bytes : 1));
+
+        // Worst-case compacted output: every slot in the hit buffer could be kept
+        CUDA_CHECK(cudaMalloc(&scratch.d_compacted, num_rays * max_depth * sizeof(HitRecord)));
+
+        // CUDA events for GPU-phase timing
+        CUDA_CHECK(cudaEventCreate(&scratch.e_gpu1_start));
+        CUDA_CHECK(cudaEventCreate(&scratch.e_gpu1_stop));
+        CUDA_CHECK(cudaEventCreate(&scratch.e_gpu2_start));
+        CUDA_CHECK(cudaEventCreate(&scratch.e_gpu2_stop));
+    }
+
+    void free_compaction_scratch(CompactionScratch &scratch)
+    {
+        // cudaFree is nullptr-safe
+        cudaFree(scratch.d_count);
+        cudaFree(scratch.d_offsets);
+        cudaFree(scratch.d_has_hit);
+        cudaFree(scratch.d_n_hit);
+        cudaFree(scratch.d_scan_tmp);
+        cudaFree(scratch.d_red_tmp);
+        cudaFree(scratch.d_compacted);
+        // cudaEventDestroy is not nullptr-safe
+        if (scratch.e_gpu1_start) cudaEventDestroy(scratch.e_gpu1_start);
+        if (scratch.e_gpu1_stop)  cudaEventDestroy(scratch.e_gpu1_stop);
+        if (scratch.e_gpu2_start) cudaEventDestroy(scratch.e_gpu2_start);
+        if (scratch.e_gpu2_stop)  cudaEventDestroy(scratch.e_gpu2_stop);
+        scratch = CompactionScratch{};
+    }
+
+    // ---------------------------------------------------------------------------
+    // Host-callable orchestrator
+    // ---------------------------------------------------------------------------
+    uint32_t gpu_compact_hit_buffer(
+        const HitRecord *d_hit_buffer,
+        uint32_t num_rays,
+        uint32_t max_depth,
+        uint64_t ray_offset,
+        std::vector<HitRecord> &host_out,
+        std::vector<uint64_t> &host_ray_ids,
+        cudaStream_t stream,
+        CompactionScratch &scratch,
+        CompactionTimings *timings)
+    {
+        if (num_rays == 0)
+            return 0;
+
+        // ---- Pass 1: count records per ray ----
+        const uint32_t block_size = 256;
+        const uint32_t grid_size = (num_rays + block_size - 1) / block_size;
+
+        if (timings) CUDA_CHECK(cudaEventRecord(scratch.e_gpu1_start, stream));
+
+        count_ray_outputs<<<grid_size, block_size, 0, stream>>>(
+            d_hit_buffer, num_rays, max_depth, scratch.d_count, scratch.d_has_hit);
+
+        // ---- Exclusive prefix-sum: d_count → d_offsets ----
+        cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, scratch.d_count, scratch.d_offsets, num_rays, stream);
+
+        // ---- Reduce: sum(d_has_hit) → d_n_hit ----
+        cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, scratch.d_has_hit, scratch.d_n_hit, num_rays, stream);
+
+        if (timings) CUDA_CHECK(cudaEventRecord(scratch.e_gpu1_stop, stream));
+
+        // ---- Synchronize to read back scalar results ----
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+
+        if (timings)
+        {
+            float ms = 0.f;
+            CUDA_CHECK(cudaEventElapsedTime(&ms, scratch.e_gpu1_start, scratch.e_gpu1_stop));
+            timings->gpu_phase1_ms += ms;
+        }
+
+        // ---- D→H scalar memcpy (CPU wall-clock) ----
+        std::chrono::high_resolution_clock::time_point t_scalar;
+        if (timings) t_scalar = std::chrono::high_resolution_clock::now();
+
+        uint64_t last_offset = 0;
+        uint8_t  last_count = 0;
+        uint32_t n_hit_rays = 0;
+        CUDA_CHECK(cudaMemcpy(&last_offset, scratch.d_offsets + (num_rays - 1), sizeof(uint64_t), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpy(&last_count, scratch.d_count + (num_rays - 1), sizeof(uint8_t), cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpy(&n_hit_rays, scratch.d_n_hit, sizeof(uint32_t), cudaMemcpyDeviceToHost));
+
+        if (timings)
+            timings->scalar_dth_ms += std::chrono::duration<float, std::milli>(
+                std::chrono::high_resolution_clock::now() - t_scalar).count();
+
+        const uint64_t total_records = last_offset + last_count;
+
+        if (total_records > 0)
+        {
+            // ---- Pass 2: write compacted HitRecords to pre-allocated device buffer ----
+            if (timings) CUDA_CHECK(cudaEventRecord(scratch.e_gpu2_start, stream));
+
+            compact_ray_outputs<<<grid_size, block_size, 0, stream>>>(
+                d_hit_buffer, num_rays, max_depth, scratch.d_offsets, scratch.d_has_hit, scratch.d_compacted);
+
+            // ---- Compact global ray IDs into d_offsets (reused after pass 2) ----
+            // DeviceSelect::Flagged selects (ray_offset + i) for each i where d_has_hit[i] == 1.
+            // d_scan_tmp and d_offsets are both free after compact_ray_outputs completes.
+            thrust::counting_iterator<uint64_t> ray_id_iter(ray_offset);
+            cub::DeviceSelect::Flagged(
+                scratch.d_scan_tmp, scratch.scan_bytes,
+                ray_id_iter, scratch.d_has_hit,
+                scratch.d_offsets,   // output: global IDs of hit rays (uint64_t)
+                scratch.d_n_hit,     // output count (already read; safe to overwrite)
+                num_rays, stream);
+
+            if (timings) CUDA_CHECK(cudaEventRecord(scratch.e_gpu2_stop, stream));
+
+            CUDA_CHECK(cudaStreamSynchronize(stream));
+
+            if (timings)
+            {
+                float ms = 0.f;
+                CUDA_CHECK(cudaEventElapsedTime(&ms, scratch.e_gpu2_start, scratch.e_gpu2_stop));
+                timings->gpu_phase2_ms += ms;
+            }
+
+            // ---- D→H bulk memcpy (CPU wall-clock) ----
+            std::chrono::high_resolution_clock::time_point t_bulk;
+            if (timings) t_bulk = std::chrono::high_resolution_clock::now();
+
+            // ---- Copy compacted HitRecords to host ----
+            const size_t prev_rec = host_out.size();
+            host_out.resize(prev_rec + total_records);
+            CUDA_CHECK(cudaMemcpy(
+                host_out.data() + prev_rec,
+                scratch.d_compacted,
+                total_records * sizeof(HitRecord),
+                cudaMemcpyDeviceToHost));
+
+            // ---- Copy global ray IDs to host (one per logical hit ray) ----
+            const size_t prev_ids = host_ray_ids.size();
+            host_ray_ids.resize(prev_ids + n_hit_rays);
+            CUDA_CHECK(cudaMemcpy(
+                host_ray_ids.data() + prev_ids,
+                scratch.d_offsets,
+                n_hit_rays * sizeof(uint64_t),
+                cudaMemcpyDeviceToHost));
+
+            if (timings)
+                timings->bulk_dth_ms += std::chrono::duration<float, std::milli>(
+                    std::chrono::high_resolution_clock::now() - t_bulk).count();
+        }
+
+        if (timings) ++timings->n_calls;
+        return n_hit_rays;
+    }
+
+} // namespace OptixCSP
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h
new file mode 100644
index 00000000..06f945b4
--- /dev/null
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cstdint>
+#include <vector>
+
+namespace OptixCSP
+{
+
+    struct HitRecord;
+
+    /// Device scratch buffers required by gpu_compact_hit_buffer.
+    /// Allocated once via allocate_compaction_scratch and reused across calls
+    /// as long as num_rays and max_depth stay the same.
+    struct CompactionScratch
+    {
+        uint8_t  *d_count = nullptr;   // per-ray output record count (bounded by max_depth ≤ 255)
+        uint64_t *d_offsets = nullptr; // exclusive prefix-sum of d_count (pass 1); reused as global ray IDs (pass 2)
+        uint8_t  *d_has_hit = nullptr; // 1 if ray contributes records, else 0
+        uint32_t *d_n_hit = nullptr;   // scalar: total hit rays
+        void *d_scan_tmp = nullptr;    // CUB DeviceScan temp storage
+        size_t scan_bytes = 0;
+        void *d_red_tmp = nullptr; // CUB DeviceReduce temp storage
+        size_t red_bytes = 0;
+        HitRecord *d_compacted = nullptr; // worst-case compacted output (num_rays * max_depth)
+
+        // CUDA events for GPU-phase timing (non-null after allocate_compaction_scratch).
+        cudaEvent_t e_gpu1_start = nullptr; // before count/scan/reduce kernels
+        cudaEvent_t e_gpu1_stop  = nullptr; // after  count/scan/reduce kernels
+        cudaEvent_t e_gpu2_start = nullptr; // before compact/select kernels
+        cudaEvent_t e_gpu2_stop  = nullptr; // after  compact/select kernels
+    };
+
+    /// Per-call timing breakdown populated by gpu_compact_hit_buffer.
+    /// All times are accumulated (ms) across calls.  Reset to {} at the start of each run().
+    struct CompactionTimings
+    {
+        float    gpu_phase1_ms = 0.f; // count + scan + reduce kernels  (GPU time via CUDA events)
+        float    scalar_dth_ms = 0.f; // 3x scalar D->H cudaMemcpy  (CPU wall-clock)
+        float    gpu_phase2_ms = 0.f; // compact + select kernels       (GPU time via CUDA events)
+        float    bulk_dth_ms   = 0.f; // HitRecord + ray-ID bulk D\u2192H (CPU wall-clock)
+        uint32_t n_calls       = 0;   // total gpu_compact_hit_buffer invocations
+    };
+
+    /// Allocate all device scratch buffers for the given ray-buffer dimensions.
+    /// Frees any previous allocation before reallocating.
+    void allocate_compaction_scratch(CompactionScratch &scratch, uint64_t num_rays, uint64_t max_depth);
+
+    /// Free all device scratch buffers and reset the struct to its default state.
+    void free_compaction_scratch(CompactionScratch &scratch);
+
+    /// GPU-side stream compaction of the raw hit buffer.
+    /// Uses pre-allocated scratch buffers — no device allocations occur inside this call.
+    /// Appends compacted HitRecords to @p host_out and the corresponding global ray indices
+    /// (ray_offset + local_ray_index) to @p host_ray_ids (one entry per logical hit ray).
+    /// Pass a non-null @p timings to accumulate per-phase timing (GPU events + CPU wall-clock).
+    /// @returns Number of rays that produced at least one non-CREATE hit.
+    uint32_t gpu_compact_hit_buffer(
+        const HitRecord *d_hit_buffer,
+        uint32_t num_rays,
+        uint32_t max_depth,
+        uint64_t ray_offset,
+        std::vector<HitRecord> &host_out,
+        std::vector<uint64_t> &host_ray_ids,
+        cudaStream_t stream,
+        CompactionScratch &scratch,
+        CompactionTimings *timings = nullptr);
+
+} // namespace OptixCSP
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp
index 12301558..ac1840a8 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp
@@ -1,21 +1,28 @@
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+
 #include "soltrace_system.h"
-#include "geometry_manager.h"
+#include "ray_utils.h"
+
+#include "CspElement.h"
 #include "data_manager.h"
+#include "geometry_manager.h"
 #include "pipeline_manager.h"
+#include "soltrace_constants.h"
 #include "soltrace_type.h"
-#include "CspElement.h"
 #include "timer.h"
-#include "soltrace_constants.h"
-#include "../../../../../simulation_data/simdata_io.hpp"
-#include "../../../../../simulation_data/solar_position_calculators/basic_sun_position.hpp"
+
+#include "shaders/Soltrace.h"
 
 #include "utils/util_record.hpp"
 #include "utils/util_check.hpp"
 #include "utils/math_util.h"
-#include <fstream>
+
+#include <algorithm>
 #include <iostream>
 #include <iomanip>
-#include <cstring>
+#include <limits>
 
 #include <optix_function_table_definition.h>
 #include <optix_stubs.h>
@@ -28,54 +35,30 @@ using namespace OptixCSP;
 // note that this is has to be per optical entity type.
 typedef Record<OptixCSP::HitGroupData> HitGroupRecord;
 
-void SolTraceSystem::set_verbose(bool verbose)
-{
-    m_verbose = verbose;
-    geometry_manager->set_verbose(verbose);
-    pipeline_manager->set_verbose(verbose);
-}
-
-void SolTraceSystem::print_launch_params()
-{
-    if (!m_verbose)
-    {
-        return;
-    }
-
-    LaunchParams params = data_manager->launch_params_H;
-
-    float3 sun_box_a = params.sun_v0 - params.sun_v1;
-    float3 sun_box_b = params.sun_v1 - params.sun_v2;
-
-    float sun_box_edge_a = sqrtf(sun_box_a.x * sun_box_a.x + sun_box_a.y * sun_box_a.y + sun_box_a.z * sun_box_a.z);
-    float sun_box_edge_b = sqrtf(sun_box_b.x * sun_box_b.x + sun_box_b.y * sun_box_b.y + sun_box_b.z * sun_box_b.z);
-
-    std::cout << "print launch params: " << std::endl;
-    std::cout << "width              : " << params.width << std::endl;
-    std::cout << "height             : " << params.height << std::endl;
-    std::cout << "max_depth          : " << params.max_depth << std::endl;
-    std::cout << "hit_point_buffer   : " << params.hit_point_buffer << std::endl;
-    std::cout << "sun_dir_buffer     : " << params.sun_dir_buffer << std::endl;
-    std::cout << "sun_vector         : " << params.sun_vector.x << " " << params.sun_vector.y << " " << params.sun_vector.z << std::endl;
-    // std::cout << "max_sun_angle      : " << params.max_sun_angle << std::endl;
-    std::cout << "sun_v0             : " << params.sun_v0.x << " " << params.sun_v0.y << " " << params.sun_v0.z << std::endl;
-    std::cout << "sun_v1             : " << params.sun_v1.x << " " << params.sun_v1.y << " " << params.sun_v1.z << std::endl;
-    std::cout << "sun_v2             : " << params.sun_v2.x << " " << params.sun_v2.y << " " << params.sun_v2.z << std::endl;
-    std::cout << "sun_v3             : " << params.sun_v3.x << " " << params.sun_v3.y << " " << params.sun_v3.z << std::endl;
-    std::cout << "sun_box_edge_a     : " << sun_box_edge_a << std::endl;
-    std::cout << "sun_box_edge_b     : " << sun_box_edge_b << std::endl;
-}
-
 SolTraceSystem::SolTraceSystem()
     : m_number_of_rays(0),
       m_max_number_of_rays(0),
+      m_batch_size(0),
+      m_max_ray_depth(DEFAULT_MAX_TRACE_DEPTH),
       m_verbose(false),
       m_mem_free_before(0),
       m_mem_free_after(0),
       m_optical_errors(false),
+      m_n_hit_rays(0),
+      m_n_sun_rays(0),
+      m_n_depth_exceeded_rays(0),
       m_include_sun_shape_errors(false),
       m_timer_setup(),
       m_timer_trace(),
+      m_timer_aabb(),
+      m_timer_geometry(),
+      m_timer_pipeline(),
+      m_timer_sbt(),
+      m_timer_setup_buffer(),
+      m_timer_optix_launch(),
+      m_timer_collect_results(),
+      m_n_run_iterations(0),
+      m_mem_free_post_setup(0),
       geometry_manager(std::make_shared<GeometryManager>(m_state, m_verbose)),
       data_manager(std::make_shared<dataManager>()),
       pipeline_manager(std::make_shared<pipelineManager>(m_state)),
@@ -87,9 +70,9 @@ SolTraceSystem::SolTraceSystem()
     if (m_verbose)
     {
         std::cout << "Using OPTIX Version: " << major
-            << "." << minor
-            << "." << micro
-            << std::endl;
+                  << "." << minor
+                  << "." << micro
+                  << std::endl;
     }
 
     CUDA_CHECK(cudaFree(0));
@@ -109,6 +92,63 @@ SolTraceSystem::SolTraceSystem()
 
 SolTraceSystem::~SolTraceSystem()
 {
+    clean_up();
+}
+
+void SolTraceSystem::set_max_ray_depth(uint64_t depth)
+{
+    if (depth < 2)
+    {
+        std::cerr << "[OptixRunner] WARNING: max_ray_depth (" << depth
+                  << ") is below the minimum of 2. Clamping to 2.\n";
+        depth = 2;
+    }
+    else if (depth > 255)
+    {
+        std::cerr << "[OptixRunner] WARNING: max_ray_depth (" << depth
+                  << ") exceeds the maximum of 255. Clamping to 255.\n";
+        depth = 255;
+    }
+    m_max_ray_depth = depth;
+}
+
+void SolTraceSystem::set_verbose(bool verbose)
+{
+    m_verbose = verbose;
+    geometry_manager->set_verbose(verbose);
+    pipeline_manager->set_verbose(verbose);
+}
+
+void SolTraceSystem::print_launch_params()
+{
+    if (!m_verbose)
+    {
+        return;
+    }
+
+    LaunchParams params = data_manager->launch_params_H;
+
+    float3 sun_box_a = params.sun_v0 - params.sun_v1;
+    float3 sun_box_b = params.sun_v1 - params.sun_v2;
+
+    float sun_box_edge_a = sqrtf(sun_box_a.x * sun_box_a.x + sun_box_a.y * sun_box_a.y + sun_box_a.z * sun_box_a.z);
+    float sun_box_edge_b = sqrtf(sun_box_b.x * sun_box_b.x + sun_box_b.y * sun_box_b.y + sun_box_b.z * sun_box_b.z);
+
+    std::cout << "print launch params: " << std::endl;
+    std::cout << "width              : " << params.width << std::endl;
+    std::cout << "height             : " << params.height << std::endl;
+    std::cout << "max_depth          : " << params.max_depth << std::endl;
+    // std::cout << "hit_point_buffer   : " << params.hit_point_buffer << std::endl;
+    std::cout << "hit_buffer         : " << params.hit_buffer << std::endl;
+    std::cout << "sun_dir_buffer     : " << params.sun_dir_buffer << std::endl;
+    std::cout << "sun_vector         : " << params.sun_vector.x << " " << params.sun_vector.y << " " << params.sun_vector.z << std::endl;
+    // std::cout << "max_sun_angle      : " << params.max_sun_angle << std::endl;
+    std::cout << "sun_v0             : " << params.sun_v0.x << " " << params.sun_v0.y << " " << params.sun_v0.z << std::endl;
+    std::cout << "sun_v1             : " << params.sun_v1.x << " " << params.sun_v1.y << " " << params.sun_v1.z << std::endl;
+    std::cout << "sun_v2             : " << params.sun_v2.x << " " << params.sun_v2.y << " " << params.sun_v2.z << std::endl;
+    std::cout << "sun_v3             : " << params.sun_v3.x << " " << params.sun_v3.y << " " << params.sun_v3.z << std::endl;
+    std::cout << "sun_box_edge_a     : " << sun_box_edge_a << std::endl;
+    std::cout << "sun_box_edge_b     : " << sun_box_edge_b << std::endl;
 }
 
 void SolTraceSystem::initialize()
@@ -129,8 +169,18 @@ void SolTraceSystem::initialize()
         OPTIX_CHECK(optixDeviceContextCreate(cuCtx, &options, &m_state.context));
     }
 
-    size_t mem_total;
-    cudaMemGetInfo(&m_mem_free_before, &mem_total);
+    // Create the CUDA stream immediately after the context so that all subsequent
+    // GPU work (GAS build, kernel launches, optixLaunch) uses the same named stream.
+    // Doing this before create_geometries() ensures optixAccelBuild and optixLaunch
+    // share a single stream, giving explicit serial ordering without relying solely
+    // on legacy null-stream synchronization semantics.
+    if (!m_state.stream)
+        CUDA_CHECK(cudaStreamCreate(&m_state.stream));
+
+    {
+        size_t mem_total;
+        CUDA_CHECK(cudaMemGetInfo(&m_mem_free_before, &mem_total));
+    }
     m_timer_setup.start();
 
     // set up input related to sun
@@ -139,24 +189,25 @@ void SolTraceSystem::initialize()
     sun_vec_norm = glm::normalize(sun_vec_norm);
 
     data_manager->launch_params_H.sun_vector = make_float3(static_cast<float>(sun_vec_norm[0]),
-                                                           static_cast<float>(sun_vec_norm[1]), static_cast<float>(sun_vec_norm[2]));
+                                                           static_cast<float>(sun_vec_norm[1]),
+                                                           static_cast<float>(sun_vec_norm[2]));
 
     // Set generation type
     switch (m_sun->get_gen_type())
     {
-        case(SolTrace::Data::GenType::RANDOM):
-            data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::RANDOM;
-            break;
-        case(SolTrace::Data::GenType::HALTON):
-            data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::HALTON;
-            break;
-        default:
-            data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::UNKNOWN;
+    case (SolTrace::Data::GenType::RANDOM):
+        data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::RANDOM;
+        break;
+    case (SolTrace::Data::GenType::HALTON):
+        data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::HALTON;
+        break;
+    default:
+        data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::UNKNOWN;
     }
 
     // Assign sun shape parameters (if necessary)
     data_manager->launch_params_H.include_sun_shape_errors = this->m_include_sun_shape_errors;
-    data_manager->allocateSunUserData({}, {});  // Clear sun user data
+    data_manager->allocateSunUserData({}, {}); // Clear sun user data
     if (this->m_include_sun_shape_errors)
     {
         // Map SolTrace::Data::SunShape to OptixCSP::SunShape for device code
@@ -208,136 +259,205 @@ void SolTraceSystem::initialize()
         data_manager->launch_params_H.sun_max_intensity = static_cast<float>(m_sun->get_max_intensity());
     }
 
-    Timer AABB_timer;
-    AABB_timer.start();
+    m_timer_aabb.reset();
+    m_timer_aabb.start();
     geometry_manager->collect_geometry_info(m_element_list, data_manager->launch_params_H);
-    AABB_timer.stop();
+    m_timer_aabb.stop();
 
-    Timer geometry_timer;
-    geometry_timer.start();
+    m_timer_geometry.reset();
+    m_timer_geometry.start();
     geometry_manager->create_geometries(data_manager->launch_params_H);
-    geometry_timer.stop();
+    m_timer_geometry.stop();
 
     // Pipeline setup.
-    Timer pipeline_timer;
-    pipeline_timer.start();
+    m_timer_pipeline.reset();
+    m_timer_pipeline.start();
+    pipeline_manager->set_max_trace_depth(m_max_ray_depth);
     pipeline_manager->createPipeline();
-    pipeline_timer.stop();
+    m_timer_pipeline.stop();
 
-    Timer sbt_timer;
-    sbt_timer.start();
+    m_timer_sbt.reset();
+    m_timer_sbt.start();
     create_shader_binding_table();
-    sbt_timer.stop();
+    m_timer_sbt.stop();
 
     // seed for randomization
     data_manager->launch_params_H.sun_dir_seed = m_seed;
     data_manager->launch_params_H.optical_errors = m_optical_errors;
 
-    // Create a CUDA stream for asynchronous operations.
-    CUDA_CHECK(cudaStreamCreate(&m_state.stream));
-
     // Link the GAS handle.
     data_manager->launch_params_H.handle = m_state.gas_handle;
     data_manager->allocateGeometryDataArray(geometry_manager->get_geometry_data_array());
     data_manager->allocateMaterialDataArray(geometry_manager->get_material_data_array_front(),
                                             geometry_manager->get_material_data_array_back());
-    
+
     if (m_verbose)
     {
-        std::cout << "Time to compute AABB: " << AABB_timer.get_time_sec() << " seconds" << std::endl;
-        std::cout << "Time to create geometries: " << geometry_timer.get_time_sec() << " seconds" << std::endl;
-        std::cout << "Time to create pipeline: " << pipeline_timer.get_time_sec() << " seconds" << std::endl;
-        std::cout << "Time to create SBT: " << sbt_timer.get_time_sec() << " seconds" << std::endl;
+        std::cout << "Time to compute AABB: " << m_timer_aabb.get_time_sec() << " seconds" << std::endl;
+        std::cout << "Time to create geometries: " << m_timer_geometry.get_time_sec() << " seconds" << std::endl;
+        std::cout << "Time to create pipeline: " << m_timer_pipeline.get_time_sec() << " seconds" << std::endl;
+        std::cout << "Time to create SBT: " << m_timer_sbt.get_time_sec() << " seconds" << std::endl;
 
         print_launch_params();
     }
-        
 
     data_manager->allocateLaunchParams();
+
+    // Snapshot free GPU memory now that all setup allocations (BVH, pipeline,
+    // SBT, geometry/material arrays, launch params) are complete but before any
+    // ray buffers exist.  automatic_batch_size() uses this as a stable baseline
+    // so that batch sizing is consistent across every run() call.
+    // Memory used by setup = m_mem_free_before - m_mem_free_post_setup.
+    {
+        size_t mem_total;
+        CUDA_CHECK(cudaMemGetInfo(&m_mem_free_post_setup, &mem_total));
+    }
+
     m_timer_setup.stop();
 }
 
 void SolTraceSystem::run()
 {
-
-    // Initialize results vectors
-    m_hp_vec.clear();
-    m_raynumber_vec.clear();
-    m_element_id_vec.clear();
-    m_hit_type_vec.clear();
+    // Initialize results
+    m_hit_records.clear();
+    m_hit_ray_ids.clear();
+    m_n_hit_rays = 0;
+    m_n_sun_rays = 0;
+    m_n_depth_exceeded_rays = 0;
     uint_fast64_t N_ray_hit = 0;
     uint_fast64_t N_ray_gen = 0;
 
+    m_timer_trace.reset();
+    m_timer_trace.start();
+    m_timer_setup_buffer.reset();
+    m_timer_optix_launch.reset();
+    m_timer_collect_results.reset();
+    m_n_run_iterations = 0;
+    m_compaction_timings = CompactionTimings{};
+
+    // Allocate device buffers and initialize RNG states once (sizes are constant across the while loop).
+    allocate_device_buffers();
+
     while (N_ray_hit < m_number_of_rays && N_ray_gen < m_max_number_of_rays)
     {
+        ++m_n_run_iterations;
+
         // Update ray offset (pushed to device in setup_device_buffer)
         data_manager->launch_params_H.ray_offset = N_ray_gen;
 
         // Allocate buffer (sets data_manager->launch_params_H buffer)
-        setup_device_buffer();
+        m_timer_setup_buffer.start();
+        {
+            setup_device_buffer();
+        }
+        m_timer_setup_buffer.stop();
 
         int width = data_manager->launch_params_H.width;
         int height = data_manager->launch_params_H.height;
 
-        size_t m_mem_free_after;
-	    size_t mem_total;
+        size_t mem_total;
         cudaMemGetInfo(&m_mem_free_after, &mem_total);
 
-        if(m_verbose)
+        if (m_verbose)
             std::cout << "Memory used by launch: " << (m_mem_free_before - m_mem_free_after) / (1024.0 * 1024.0) << " MB\n";
 
-        m_timer_trace.start();
         // Launch the simulation.
-        OPTIX_CHECK(optixLaunch(
-            m_state.pipeline,
-            m_state.stream, // Assume this stream is properly created.
-            reinterpret_cast<CUdeviceptr>(data_manager->getDeviceLaunchParams()),
-            sizeof(OptixCSP::LaunchParams),
-            &m_state.sbt, // Shader Binding Table.
-            width,        // Launch dimensions
-            height,
-            1));
-        CUDA_SYNC_CHECK();
+        m_timer_optix_launch.start();
+        {
+            OPTIX_CHECK(optixLaunch(
+                m_state.pipeline,
+                m_state.stream, // Assume this stream is properly created.
+                reinterpret_cast<CUdeviceptr>(data_manager->getDeviceLaunchParams()),
+                sizeof(OptixCSP::LaunchParams),
+                &m_state.sbt, // Shader Binding Table.
+                width,        // Launch dimensions
+                height,
+                1));
+            CUDA_SYNC_CHECK();
+        }
+        m_timer_optix_launch.stop();
 
         // Collect results
-        get_buffer_results(m_hp_vec, m_raynumber_vec, m_element_id_vec, m_hit_type_vec,
-                           m_sunraynumber_vec);
-        N_ray_hit = m_raynumber_vec.empty() ? 0 : m_raynumber_vec.back();
+        m_timer_collect_results.start();
+        {
+            get_buffer_results();
+        }
+        m_timer_collect_results.stop();
+
+        // Read back depth-exceeded count for this batch
+        uint64_t iter_depth_exceeded = 0;
+        CUDA_CHECK(cudaMemcpy(&iter_depth_exceeded,
+                              data_manager->launch_params_H.d_depth_exceeded_count,
+                              sizeof(uint64_t), cudaMemcpyDeviceToHost));
+        m_n_depth_exceeded_rays += iter_depth_exceeded;
+
+        N_ray_hit = m_n_hit_rays;
         N_ray_gen += width;
+        m_n_sun_rays = N_ray_gen;
     }
 
-    // Trim excess rays
-    if (N_ray_hit > m_number_of_rays)
+    // Trim excess rays: remove ray groups from the tail until m_n_hit_rays == m_number_of_rays.
+    // Each group starts at the last HIT_CREATE record in m_hit_records.
+    while (m_trim_excess_rays && m_n_hit_rays > m_number_of_rays && !m_hit_records.empty())
     {
-        while (m_raynumber_vec.back() > m_number_of_rays)
-        {
-            m_hp_vec.pop_back();
-            m_raynumber_vec.pop_back();
-            m_element_id_vec.pop_back();
-            m_hit_type_vec.pop_back();
-            m_sunraynumber_vec.pop_back();
-        }
+        // Walk backwards to find the last CREATE record
+        auto rit = std::find_if(m_hit_records.rbegin(), m_hit_records.rend(),
+                                [](const HitRecord &r)
+                                { return r.hit_type == HitType::HIT_CREATE; });
+        if (rit == m_hit_records.rend())
+            break;
+        m_hit_records.erase(std::prev(rit.base()), m_hit_records.end());
+        m_hit_ray_ids.pop_back();
+        --m_n_hit_rays;
     }
+    // m_n_sun_rays = rays generated up to and including the last retained hit ray.
+    if (!m_hit_ray_ids.empty())
+        m_n_sun_rays = m_hit_ray_ids.back() + 1;
 
     m_timer_trace.stop();
+
+    if (m_n_depth_exceeded_rays > 0)
+        std::cout << "[SolTraceSystem] " << m_n_depth_exceeded_rays
+                  << " ray(s) were terminated due to reaching max_depth ("
+                  << static_cast<unsigned>(data_manager->launch_params_H.max_depth) << ").\n";
+
+    if (m_verbose)
+    {
+        const double t_setup = m_timer_setup_buffer.get_time_sec();
+        const double t_launch = m_timer_optix_launch.get_time_sec();
+        const double t_collect = m_timer_collect_results.get_time_sec();
+        const double t_total = t_setup + t_launch + t_collect;
+        const double inv_n = m_n_run_iterations > 0 ? 1.0 / static_cast<double>(m_n_run_iterations) : 0.0;
+
+        std::cout << "\n--- SolTraceSystem::run() timing (" << m_n_run_iterations << " iteration"
+                  << (m_n_run_iterations == 1 ? "" : "s") << ") ---\n";
+        std::cout << std::fixed << std::setprecision(6);
+        std::cout << "  setup_device_buffer : total = " << t_setup << " s"
+                  << "  avg = " << t_setup * inv_n << " s"
+                  << "  fraction = " << (t_total > 0.0 ? 100.0 * t_setup / t_total : 0.0) << " %\n";
+        std::cout << "  optixLaunch         : total = " << t_launch << " s"
+                  << "  avg = " << t_launch * inv_n << " s"
+                  << "  fraction = " << (t_total > 0.0 ? 100.0 * t_launch / t_total : 0.0) << " %\n";
+        std::cout << "  get_buffer_results  : total = " << t_collect << " s"
+                  << "  avg = " << t_collect * inv_n << " s"
+                  << "  fraction = " << (t_total > 0.0 ? 100.0 * t_collect / t_total : 0.0) << " %\n";
+        std::cout << "  total (3 sections)  : " << t_total << " s\n";
+        std::cout << "----------------------------------------------\n";
+    }
 }
 
 void SolTraceSystem::update()
 {
 
-    const int N_slots = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth;
-    const size_t hit_point_buffer_size = N_slots * sizeof(float4);
-    const size_t element_id_size = N_slots * sizeof(int32_t);
-    const size_t hit_type_buffer_size = N_slots * sizeof(uint8_t);
+    const size_t N_slots = static_cast<size_t>(data_manager->launch_params_H.width) * static_cast<size_t>(data_manager->launch_params_H.height) * static_cast<size_t>(data_manager->launch_params_H.max_depth);
+    const size_t hit_buffer_size = N_slots * sizeof(HitRecord);
 
     // update aabb and sun plane accordingly
     geometry_manager->update_geometry_info(m_element_list, data_manager->launch_params_H);
 
     // update data on the device
     data_manager->updateGeometryDataArray(geometry_manager->get_geometry_data_array());
-    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_point_buffer, 0, hit_point_buffer_size));
-    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.element_id_buffer, kElementIdBuffer, element_id_size));
-    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_type_buffer, HitType::HIT_UNASSIGNED, hit_type_buffer_size));
+    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size));
 
     data_manager->updateLaunchParams();
 }
@@ -347,10 +467,25 @@ void SolTraceSystem::get_hp_output(std::vector<float4> &hp_vec,
                                    std::vector<int32_t> &element_id_vec,
                                    std::vector<uint8_t> &hit_type_vec)
 {
-    hp_vec = m_hp_vec;
-    raynumber_vec = m_raynumber_vec;
-    element_id_vec = m_element_id_vec;
-    hit_type_vec = m_hit_type_vec;
+    hp_vec.clear();
+    raynumber_vec.clear();
+    element_id_vec.clear();
+    hit_type_vec.clear();
+    hp_vec.reserve(m_hit_records.size());
+    raynumber_vec.reserve(m_hit_records.size());
+    element_id_vec.reserve(m_hit_records.size());
+    hit_type_vec.reserve(m_hit_records.size());
+
+    uint_fast64_t ray_number = 0;
+    for (const HitRecord &r : m_hit_records)
+    {
+        if (r.hit_type == HitType::HIT_CREATE)
+            ++ray_number;
+        hp_vec.push_back(r.hit_point);
+        raynumber_vec.push_back(ray_number);
+        element_id_vec.push_back(r.element_id);
+        hit_type_vec.push_back(r.hit_type);
+    }
 }
 
 void SolTraceSystem::clean_up()
@@ -386,28 +521,17 @@ void SolTraceSystem::clean_up()
     CUDA_CHECK(cudaFree(reinterpret_cast<void *>(m_state.d_gas_output_buffer)));
 
     // Free device-side launch parameter memory
-    CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.hit_point_buffer)));
-    CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.element_id_buffer)));
-    CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.hit_type_buffer)));
+    CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.hit_buffer)));
     CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.sun_dir_buffer)));
 
-    data_manager->launch_params_H.hit_point_buffer = nullptr;
-    data_manager->launch_params_H.element_id_buffer = nullptr;
-    data_manager->launch_params_H.hit_type_buffer = nullptr;
+    data_manager->launch_params_H.hit_buffer = nullptr;
     data_manager->launch_params_H.sun_dir_buffer = nullptr;
-    m_hit_point_buffer_size_allocated = 0;
-    m_element_id_buffer_size_allocated = 0;
-    m_hit_type_buffer_size_allocated = 0;
+    m_hit_buffer_size_allocated = 0;
     m_sun_dir_buffer_size_allocated = 0;
 
-    data_manager->cleanup();
+    free_compaction_scratch(m_compaction_scratch);
 
-    m_hp_output_buffer_host.clear();
-    m_hp_output_buffer_host.shrink_to_fit();
-    m_element_id_buffer_host.clear();
-    m_element_id_buffer_host.shrink_to_fit();
-    m_hit_type_buffer_host.clear();
-    m_hit_type_buffer_host.shrink_to_fit();
+    data_manager->cleanup();
 
     m_state.context = nullptr;
     m_state.stream = nullptr;
@@ -420,6 +544,10 @@ void SolTraceSystem::clean_up()
     m_state.gas_handle = 0;
     m_state.sbt = {};
     m_state.d_gas_output_buffer = 0;
+
+    m_mem_free_before = 0;
+    m_mem_free_post_setup = 0;
+    m_mem_free_after = 0;
 }
 
 void SolTraceSystem::reset()
@@ -427,15 +555,11 @@ void SolTraceSystem::reset()
     clean_up();
 
     m_element_list.clear();
-    m_hp_vec.clear();
-    m_raynumber_vec.clear();
-    m_element_id_vec.clear();
-    m_hit_type_vec.clear();
-    m_sunraynumber_vec.clear();
-
-    m_hp_output_buffer_host.clear();
-    m_element_id_buffer_host.clear();
-    m_hit_type_buffer_host.clear();
+    m_hit_records.clear();
+    m_hit_ray_ids.clear();
+    m_n_hit_rays = 0;
+    m_n_sun_rays = 0;
+    m_n_depth_exceeded_rays = 0;
 
     m_sun = nullptr;
     m_number_of_rays = 0;
@@ -447,6 +571,11 @@ void SolTraceSystem::reset()
 // with their corresponding programs (ray generation, miss, and hit group).
 void SolTraceSystem::create_shader_binding_table()
 {
+    // Free any previously allocated SBT records to avoid leaks on re-initialization.
+    CUDA_CHECK(cudaFree(reinterpret_cast<void *>(m_state.sbt.raygenRecord)));
+    CUDA_CHECK(cudaFree(reinterpret_cast<void *>(m_state.sbt.missRecordBase)));
+    CUDA_CHECK(cudaFree(reinterpret_cast<void *>(m_state.sbt.hitgroupRecordBase)));
+    m_state.sbt = {};
 
     // Ray generation program record
     {
@@ -512,55 +641,6 @@ void SolTraceSystem::create_shader_binding_table()
             // initialize program handle and data
             OptixProgramGroup program_group_handle = pipeline_manager->getElementProgram(my_type);
             hitgroup_records_list[i].data.material_data = {0.875425, 0, 0, 0};
-            // OptixProgramGroup program_group_handle = nullptr;
-            // SurfaceApertureMap map = {};
-
-            // switch (my_type)
-            // {
-            // case OptixCSP::OpticalEntityType::RECTANGLE_FLAT:
-            //     map = {SurfaceType::FLAT, ApertureType::RECTANGLE};
-            //     program_group_handle = pipeline_manager->getElementProgram(map);
-            //     hitgroup_records_list[i].data.material_data = {0.875425, 0, 0, 0};
-            //     printf("RECTANGLE_FLAT, program group address: %p \n", program_group_handle);
-
-            //     break;
-
-            // case OptixCSP::OpticalEntityType::RECTANGLE_PARABOLIC:
-            //     map = {SurfaceType::PARABOLIC, ApertureType::RECTANGLE};
-            //     program_group_handle = pipeline_manager->getElementProgram(map);
-            //     hitgroup_records_list[i].data.material_data = {0.875425, 0, 0, 0};
-            //     printf("RECTANGLE_PARABOLIC, program group address: %p \n", program_group_handle);
-
-            //     break;
-
-            // case OptixCSP::OpticalEntityType::CYLINDRICAL:
-            //     map = {SurfaceType::CYLINDER, ApertureType::RECTANGLE};
-            //     program_group_handle = pipeline_manager->getElementProgram(map);
-            //     hitgroup_records_list[i].data.material_data = {0.95, 0, 0, 0};
-            //     printf("CYLINDRICAL, program group address: %p \n", program_group_handle);
-
-            //     break;
-
-            // case OptixCSP::OpticalEntityType::TRIANGLE_FLAT:
-            //     map = {SurfaceType::FLAT, ApertureType::TRIANGLE};
-            //     program_group_handle = pipeline_manager->getElementProgram(map);
-            //     hitgroup_records_list[i].data.material_data = {0.95, 0, 0, 0};
-            //     printf("FLAT_TRIANGLE, program group address: %p \n", program_group_handle);
-
-            //     break;
-
-            // case OptixCSP::OpticalEntityType::QUADRILATERAL_FLAT:
-            //     ma = {SurfaceType::FLAT, ApertureType::QUADRILATERAL};
-            //     program_group_handle = pipeline_manager->getElementProgram(map);
-            //     hitgroup_records_list[i].data.material_data = {0.875425, 0, 0, 0};
-            //     printf("FLAT_QUADRILATERAL, program group address: %p \n", program_group_handle);
-
-            //     break;
-
-            // default:
-            //     std::cerr << "Unknown OpticalEntityType: " << my_type << std::endl;
-            // }
-
             OPTIX_CHECK(optixSbtRecordPackHeader(program_group_handle, &hitgroup_records_list[i].header));
         }
 
@@ -585,144 +665,84 @@ void SolTraceSystem::create_shader_binding_table()
     }
 }
 
-void SolTraceSystem::setup_device_buffer()
+void SolTraceSystem::allocate_device_buffers()
 {
-    // Initialize launch params
-    data_manager->launch_params_H.width = m_number_of_rays;
+    // Set constant launch params (unchanged across the while loop).
+    const uint_fast64_t effective_batch = determine_batch_size();
+    data_manager->launch_params_H.width = static_cast<int>(effective_batch);
     data_manager->launch_params_H.height = 1;
-    data_manager->launch_params_H.max_depth = MAX_TRACE_DEPTH;
+    data_manager->launch_params_H.max_depth = m_max_ray_depth;
 
-    const size_t hit_point_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float4) * data_manager->launch_params_H.max_depth;
-    const size_t element_id_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(int32_t) * data_manager->launch_params_H.max_depth;
-    const size_t hit_type_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(uint8_t) * data_manager->launch_params_H.max_depth;
-    const size_t sun_dir_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float3);
+    const size_t hit_buffer_size = static_cast<size_t>(data_manager->launch_params_H.width) * static_cast<size_t>(data_manager->launch_params_H.height) * static_cast<size_t>(data_manager->launch_params_H.max_depth) * sizeof(HitRecord);
+    const size_t sun_dir_size = static_cast<size_t>(data_manager->launch_params_H.width) * static_cast<size_t>(data_manager->launch_params_H.height) * sizeof(float3);
 
-    if (data_manager->launch_params_H.hit_point_buffer == nullptr || m_hit_point_buffer_size_allocated != hit_point_buffer_size)
-    {
-        if (data_manager->launch_params_H.hit_point_buffer != nullptr)
-            CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.hit_point_buffer)));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&data_manager->launch_params_H.hit_point_buffer), hit_point_buffer_size));
-        m_hit_point_buffer_size_allocated = hit_point_buffer_size;
-    }
-    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_point_buffer, 0, hit_point_buffer_size));
-
-    if (data_manager->launch_params_H.element_id_buffer == nullptr || m_element_id_buffer_size_allocated != element_id_size)
-    {
-        if (data_manager->launch_params_H.element_id_buffer != nullptr)
-            CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.element_id_buffer)));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&data_manager->launch_params_H.element_id_buffer), element_id_size));
-        m_element_id_buffer_size_allocated = element_id_size;
-    }
-    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.element_id_buffer, kElementIdBuffer, element_id_size));
+    // NOTE: cudaFree is nullptr safe
 
-    if (data_manager->launch_params_H.hit_type_buffer == nullptr || m_hit_type_buffer_size_allocated != hit_type_size)
+    if (data_manager->launch_params_H.hit_buffer == nullptr || m_hit_buffer_size_allocated != hit_buffer_size)
     {
-        if (data_manager->launch_params_H.hit_type_buffer != nullptr)
-            CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.hit_type_buffer)));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&data_manager->launch_params_H.hit_type_buffer), hit_type_size));
-        m_hit_type_buffer_size_allocated = hit_type_size;
+        CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.hit_buffer)));
+        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&data_manager->launch_params_H.hit_buffer), hit_buffer_size));
+        m_hit_buffer_size_allocated = hit_buffer_size;
+
+        // Reallocate compaction scratch whenever ray-buffer dimensions change
+        const uint64_t num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height;
+        const uint64_t max_depth = static_cast<uint64_t>(data_manager->launch_params_H.max_depth);
+        allocate_compaction_scratch(m_compaction_scratch, num_rays, max_depth);
     }
-    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_type_buffer, HitType::HIT_UNASSIGNED, hit_type_size));
 
     if (data_manager->launch_params_H.sun_dir_buffer == nullptr || m_sun_dir_buffer_size_allocated != sun_dir_size)
     {
-        if (data_manager->launch_params_H.sun_dir_buffer != nullptr)
-            CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.sun_dir_buffer)));
+        CUDA_CHECK(cudaFree(reinterpret_cast<void *>(data_manager->launch_params_H.sun_dir_buffer)));
         CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&data_manager->launch_params_H.sun_dir_buffer), sun_dir_size));
         m_sun_dir_buffer_size_allocated = sun_dir_size;
     }
-    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.sun_dir_buffer, 0, sun_dir_size));
 
-    const unsigned int num_rng_states = static_cast<unsigned int>(data_manager->launch_params_H.width * data_manager->launch_params_H.height);
+    // Initialize RNG states once (sizes are constant across the while loop).
+    // curand states are persistent on the device and advance naturally across kernel launches.
+    const unsigned int num_rng_states = static_cast<unsigned int>(
+        data_manager->launch_params_H.width * data_manager->launch_params_H.height);
     data_manager->ensureCurandStates(
         num_rng_states,
         data_manager->launch_params_H.sun_dir_seed,
-        data_manager->launch_params_H.ray_offset,
+        0,
         m_state.stream);
 
-    data_manager->updateLaunchParams();
+    data_manager->ensureDepthExceededCounter();
 }
 
-// Collects results from device buffer
-// only keeps rays that hit elements
-void SolTraceSystem::get_buffer_results(std::vector<float4> &hp_vec, std::vector<uint_fast64_t> &raynumber_vec,
-                                        std::vector<int32_t> &element_id_vec, std::vector<uint8_t> &hit_type_vec,
-                                        std::vector<uint_fast64_t> &sunraynumber_vec)
+void SolTraceSystem::setup_device_buffer()
 {
-    const int max_depth = data_manager->launch_params_H.max_depth;
-    const int num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height;
-    const int output_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth;
-
-    if (static_cast<int>(m_hp_output_buffer_host.size()) != output_size)
-        m_hp_output_buffer_host.resize(output_size);
-    if (static_cast<int>(m_element_id_buffer_host.size()) != output_size)
-        m_element_id_buffer_host.resize(output_size);
-    if (static_cast<int>(m_hit_type_buffer_host.size()) != output_size)
-        m_hit_type_buffer_host.resize(output_size);
-
-    CUDA_CHECK(cudaMemcpy(m_hp_output_buffer_host.data(), data_manager->launch_params_H.hit_point_buffer, output_size * sizeof(float4), cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(m_element_id_buffer_host.data(), data_manager->launch_params_H.element_id_buffer, output_size * sizeof(int32_t), cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(m_hit_type_buffer_host.data(), data_manager->launch_params_H.hit_type_buffer, output_size * sizeof(uint8_t), cudaMemcpyDeviceToHost));
-
-    // Loop through each buffer slot
-    uint_fast64_t ray_number = raynumber_vec.empty() ? 0 : raynumber_vec.back();
-    uint_fast64_t sunray_number = sunraynumber_vec.empty() ? 0 : sunraynumber_vec.back();
-    for (int i = 0; i < output_size; ++i)
-    {
-
-        // Get hit type
-        const uint8_t &hit_type = m_hit_type_buffer_host[i];
-
-        // Skip if empty
-        if (hit_type < HitType::HIT_CREATE || hit_type > HitType::HIT_EXIT)
-        {
-            continue;
-        }
+    const size_t hit_buffer_size = static_cast<size_t>(data_manager->launch_params_H.width) * static_cast<size_t>(data_manager->launch_params_H.height) * static_cast<size_t>(data_manager->launch_params_H.max_depth) * sizeof(HitRecord);
+    const size_t sun_dir_size = static_cast<size_t>(data_manager->launch_params_H.width) * static_cast<size_t>(data_manager->launch_params_H.height) * sizeof(float3);
 
-        // If new ray, check if previous ray hit anything
-        if (hit_type == HitType::HIT_CREATE)
-        {
-            // Remove last ray if it has no hits
-            if (!hit_type_vec.empty() && hit_type_vec.back() == HitType::HIT_CREATE)
-            {
-                hp_vec.pop_back();
-                raynumber_vec.pop_back();
-                hit_type_vec.pop_back();
-                element_id_vec.pop_back();
-                sunraynumber_vec.pop_back();
-                ray_number--;
-            }
-
-            // New ray
-            ray_number++;
-
-            // Sun ray number always increments, even if no hit
-            sunray_number++;
-        }
-
-        // Get hit record, element_id
-        const float4 &hit_record = m_hp_output_buffer_host[i]; // [depth, pos x, pos y, pos z]
-        const int32_t &element_id = m_element_id_buffer_host[i];
-
-        // Collect results
-        hp_vec.push_back(hit_record);
-        raynumber_vec.push_back(ray_number);
-        hit_type_vec.push_back(hit_type);
-        element_id_vec.push_back(element_id);
-        sunraynumber_vec.push_back(sunray_number);
-    }
+    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size));
+    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.sun_dir_buffer, 0, sun_dir_size));
+    CUDA_CHECK(cudaMemset(data_manager->launch_params_H.d_depth_exceeded_count, 0, sizeof(uint64_t)));
 
-    // Remove last ray if it is only CREATE
-    if (!hit_type_vec.empty() && hit_type_vec.back() == HitType::HIT_CREATE)
-    {
-        hp_vec.pop_back();
-        raynumber_vec.pop_back();
-        element_id_vec.pop_back();
-        hit_type_vec.pop_back();
-        sunraynumber_vec.pop_back();
-    }
+    data_manager->updateLaunchParams();
+}
 
-    return;
+// Compacts the device hit buffer on the GPU, then copies only qualifying records to host.
+// Rays that produced only a HIT_CREATE event (missed all elements) are discarded.
+// Empty depth slots are discarded. The compacted HitRecord array is appended to
+// m_hit_records and m_n_hit_rays is incremented by the number of newly collected hit rays.
+void SolTraceSystem::get_buffer_results()
+{
+    const uint32_t num_rays = static_cast<uint32_t>(data_manager->launch_params_H.width *
+                                                    data_manager->launch_params_H.height);
+    const uint32_t max_depth = static_cast<uint32_t>(data_manager->launch_params_H.max_depth);
+
+    const uint32_t n_new_hits = gpu_compact_hit_buffer(
+        data_manager->launch_params_H.hit_buffer,
+        num_rays,
+        max_depth,
+        data_manager->launch_params_H.ray_offset,
+        m_hit_records,
+        m_hit_ray_ids,
+        m_state.stream,
+        m_compaction_scratch,
+        &m_compaction_timings);
+    m_n_hit_rays += n_new_hits;
 }
 
 void SolTraceSystem::add_element(std::shared_ptr<CspElement> e)
@@ -740,6 +760,89 @@ double SolTraceSystem::get_time_setup()
     return m_timer_setup.get_time_sec();
 }
 
+void SolTraceSystem::print_timing() const
+{
+    const double t_setup = m_timer_setup.get_time_sec();
+    const double t_aabb = m_timer_aabb.get_time_sec();
+    const double t_geometry = m_timer_geometry.get_time_sec();
+    const double t_pipeline = m_timer_pipeline.get_time_sec();
+    const double t_sbt = m_timer_sbt.get_time_sec();
+
+    const double t_trace = m_timer_trace.get_time_sec();
+    const double t_buf_setup = m_timer_setup_buffer.get_time_sec();
+    const double t_launch = m_timer_optix_launch.get_time_sec();
+    const double t_collect = m_timer_collect_results.get_time_sec();
+
+    const double inv_n = m_n_run_iterations > 0
+                             ? 1.0 / static_cast<double>(m_n_run_iterations)
+                             : 0.0;
+
+    const auto pct = [](double num, double denom) -> double
+    {
+        return denom > 0.0 ? 100.0 * num / denom : 0.0;
+    };
+
+    std::cout << std::fixed << std::setprecision(6);
+    std::cout << "\n=== SolTraceSystem Timing Summary ===\n";
+
+    std::cout << "\n--- initialize() ---\n";
+    std::cout << "  AABB computation    : " << t_aabb << " s  (" << pct(t_aabb, t_setup) << " %)\n";
+    std::cout << "  Geometry creation   : " << t_geometry << " s  (" << pct(t_geometry, t_setup) << " %)\n";
+    std::cout << "  Pipeline creation   : " << t_pipeline << " s  (" << pct(t_pipeline, t_setup) << " %)\n";
+    std::cout << "  SBT creation        : " << t_sbt << " s  (" << pct(t_sbt, t_setup) << " %)\n";
+    std::cout << "  Total setup         : " << t_setup << " s\n";
+
+    std::cout << "\n--- run() [" << m_n_run_iterations
+              << " iteration" << (m_n_run_iterations == 1 ? "" : "s") << "] ---\n";
+    std::cout << "  Setup device buffer : total = " << t_buf_setup << " s"
+              << "  avg/iter = " << t_buf_setup * inv_n << " s"
+              << "  (" << pct(t_buf_setup, t_trace) << " %)\n";
+    std::cout << "  OptiX launch        : total = " << t_launch << " s"
+              << "  avg/iter = " << t_launch * inv_n << " s"
+              << "  (" << pct(t_launch, t_trace) << " %)\n";
+    std::cout << "  Collect results     : total = " << t_collect << " s"
+              << "  avg/iter = " << t_collect * inv_n << " s"
+              << "  (" << pct(t_collect, t_trace) << " %)\n";
+    if (m_compaction_timings.n_calls > 0)
+    {
+        const float inv_c = 1.0f / static_cast<float>(m_compaction_timings.n_calls);
+        std::cout << std::fixed << std::setprecision(4);
+        std::cout << "    GPU pass 1 (count/scan/reduce) : total = " << m_compaction_timings.gpu_phase1_ms << " ms"
+                  << "  avg/call = " << m_compaction_timings.gpu_phase1_ms * inv_c << " ms\n";
+        std::cout << "    D->H scalars (3x memcpy)       : total = " << m_compaction_timings.scalar_dth_ms << " ms"
+                  << "  avg/call = " << m_compaction_timings.scalar_dth_ms * inv_c << " ms\n";
+        std::cout << "    GPU pass 2 (compact/select)    : total = " << m_compaction_timings.gpu_phase2_ms << " ms"
+                  << "  avg/call = " << m_compaction_timings.gpu_phase2_ms * inv_c << " ms\n";
+        std::cout << "    D->H bulk (records+ids)        : total = " << m_compaction_timings.bulk_dth_ms << " ms"
+                  << "  avg/call = " << m_compaction_timings.bulk_dth_ms * inv_c << " ms\n";
+        std::cout << std::fixed << std::setprecision(6);
+    }
+    std::cout << "  Total trace         : " << t_trace << " s\n";
+
+    std::cout << "\n--- Grand Total ---\n";
+    std::cout << "  Setup + Trace       : " << (t_setup + t_trace) << " s\n";
+
+    std::cout << "\n--- GPU Memory Usage ---\n";
+    constexpr double kMB = 1.0 / (1024.0 * 1024.0);
+    if (m_mem_free_before > 0)
+    {
+        std::cout << std::fixed << std::setprecision(2);
+        std::cout << "  Free before setup   : " << m_mem_free_before * kMB << " MB\n";
+        if (m_mem_free_post_setup > 0)
+        {
+            std::cout << "  Free after setup    : " << m_mem_free_post_setup * kMB << " MB\n";
+            std::cout << "  Setup structures    : " << (m_mem_free_before - m_mem_free_post_setup) * kMB << " MB\n";
+            if (m_mem_free_after > 0)
+            {
+                std::cout << "  Ray buffers         : " << (m_mem_free_post_setup - m_mem_free_after) * kMB << " MB\n";
+                std::cout << "  Total used          : " << (m_mem_free_before - m_mem_free_after) * kMB << " MB\n";
+            }
+        }
+        std::cout << std::fixed << std::setprecision(6);
+    }
+    std::cout << "=====================================\n";
+}
+
 double SolTraceSystem::get_sun_plane_area() const
 {
     const LaunchParams &lp = data_manager->launch_params_H;
@@ -752,3 +855,77 @@ double SolTraceSystem::get_sun_plane_area() const
         a.x * b.y - a.y * b.x);
     return static_cast<double>(sqrtf(cross.x * cross.x + cross.y * cross.y + cross.z * cross.z));
 }
+
+uint_fast64_t SolTraceSystem::automatic_batch_size() const
+{
+    // Use the free-memory snapshot taken at the end of initialize(), after all
+    // setup allocations (BVH, pipeline, SBT, etc.) but before any ray buffers.
+    // This gives a stable baseline that does not shrink on subsequent run() calls
+    // due to the already-allocated (and reused) ray buffers being counted as used.
+    const size_t mem_free = m_mem_free_post_setup;
+
+    // Reserve 20 % headroom for OptiX internal allocations, memory
+    // fragmentation, and any other transient allocations during launch.
+    constexpr double kUsableFraction = 0.80;
+    const size_t usable_bytes = static_cast<size_t>(
+        static_cast<double>(mem_free) * kUsableFraction);
+
+    // Per-ray device memory charged by allocate_device_buffers() and
+    // allocate_compaction_scratch():
+    //   hit_buffer      DEFAULT_MAX_TRACE_DEPTH * sizeof(HitRecord)  -- trace output
+    //   d_compacted     DEFAULT_MAX_TRACE_DEPTH * sizeof(HitRecord)  -- worst-case compacted copy
+    //   sun_dir_buffer  sizeof(float3)                       -- sun ray direction
+    //   curand states   sizeof(curandState)                  -- RNG state
+    //   d_offsets       sizeof(uint64_t)                     -- compaction prefix sum / global ray IDs
+    //   d_count         sizeof(uint8_t)                      -- compaction hit count (bounded by DEFAULT_MAX_TRACE_DEPTH <= 255)
+    //   d_has_hit       sizeof(uint8_t)                      -- per-ray hit flag
+    const size_t bytes_per_ray =
+        2u * m_max_ray_depth * sizeof(HitRecord) + sizeof(float3) + sizeof(curandState) + sizeof(uint64_t) + 2u * sizeof(uint8_t);
+
+    const uint_fast64_t computed =
+        (bytes_per_ray > 0) ? static_cast<uint_fast64_t>(usable_bytes / bytes_per_ray) : 0u;
+
+    // Cap at int max / m_max_ray_depth (OptiX launch width is signed int).
+    uint_fast64_t batch_size = std::min(
+        computed,
+        static_cast<uint_fast64_t>(std::numeric_limits<int>::max() / m_max_ray_depth));
+
+    if (m_verbose)
+    {
+        std::cout << "automatic_batch_size:"
+                  << " free=" << mem_free / (1024.0 * 1024.0) << " MB"
+                  << ", usable=" << usable_bytes / (1024.0 * 1024.0) << " MB"
+                  << ", bytes_per_ray=" << bytes_per_ray
+                  << ", batch_size=" << batch_size << "\n";
+    }
+
+    return batch_size;
+}
+
+uint_fast64_t SolTraceSystem::determine_batch_size() const
+{
+    // Estimates number of rays that can be traced in a single batch based on
+    // available GPU memory.
+    uint_fast64_t batch_size = automatic_batch_size();
+
+    if (m_batch_size > 0)
+    {
+        if (m_batch_size > batch_size && batch_size > 0)
+        {
+            std::cerr << "[SolTraceSystem] WARNING: user-supplied batch_size ("
+                      << m_batch_size
+                      << ") exceeds the GPU-memory-safe automatic batch size ("
+                      << batch_size
+                      << "). This may cause device out-of-memory errors or "
+                         "degraded GPU performance.\n";
+        }
+        batch_size = m_batch_size;
+    }
+    else
+    {
+        // Take the smaller of the automatic batch_size and number of rays?
+        batch_size = batch_size > 0 ? std::min(batch_size, m_number_of_rays) : m_number_of_rays;
+    }
+
+    return batch_size;
+}
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h
index 9fcbbb4c..0e13fd1a 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h
@@ -1,22 +1,25 @@
 #pragma once
 
+#include <cstddef>
+#include <cstdio>
+#include <limits>
+#include <memory>
+#include <stdexcept>
 #include <string>
 #include <vector>
-#include <memory>                 
-#include <cstddef>                
-#include <cstdio>                 
-
-
 
 #include "core/soltrace_state.h" // SoltraceState
-#include "core/vec3d.h"      // Vec3d
+#include "core/vec3d.h"          // Vec3d
 #include "core/timer.h"
-#include "core/CspElement.h" // CspElement
-#include "core/Surface.h"    // Surface and derived classes
+#include "core/CspElement.h"  // CspElement
+#include "core/Surface.h"     // Surface and derived classes
+#include "shaders/Soltrace.h" // HitRecord, HitType
+#include "ray_utils.h"        // CompactionScratch
 
 #include "../../../../../simulation_data/simulation_data_export.hpp"
 
-namespace OptixCSP {
+namespace OptixCSP
+{
 
     class GeometryManager;
     class pipelineManager;
@@ -26,17 +29,16 @@ namespace OptixCSP {
     class Surface;
 
     static constexpr SolTrace::Data::SunShape kSupportedSunshapes[] = {
-            SolTrace::Data::SunShape::GAUSSIAN,
-            SolTrace::Data::SunShape::PILLBOX,
-            SolTrace::Data::SunShape::BUIE_CSR,
-            SolTrace::Data::SunShape::LIMBDARKENED,
-            SolTrace::Data::SunShape::USER_DEFINED
-    };
+        SolTrace::Data::SunShape::GAUSSIAN,
+        SolTrace::Data::SunShape::PILLBOX,
+        SolTrace::Data::SunShape::BUIE_CSR,
+        SolTrace::Data::SunShape::LIMBDARKENED,
+        SolTrace::Data::SunShape::USER_DEFINED};
 
-    class SolTraceSystem {
+    class SolTraceSystem
+    {
 
     public:
-        
         SolTraceSystem();
         ~SolTraceSystem();
 
@@ -50,8 +52,8 @@ namespace OptixCSP {
         void update();
 
         // Get all hit points
-        void get_hp_output(std::vector<float4>& hp_vec, std::vector<uint_fast64_t>& raynumber_vec, std::vector<int32_t>& element_id_vec,
-            std::vector<uint8_t>& hit_type_vec);
+        void get_hp_output(std::vector<float4> &hp_vec, std::vector<uint_fast64_t> &raynumber_vec, std::vector<int32_t> &element_id_vec,
+                           std::vector<uint8_t> &hit_type_vec);
 
         /// Explicit cleanup
         void clean_up();
@@ -66,14 +68,31 @@ namespace OptixCSP {
         /// </summary>
         /// <param name="numSunPoints"></param>
         void set_number_of_rays(uint_fast64_t nrays, uint_fast64_t maxrays)
-        { 
+        {
             m_number_of_rays = nrays;
             m_max_number_of_rays = maxrays;
         }
 
-        void set_sun(SolTrace::Data::Sun* sun) { m_sun = sun; }
+        /// Set the number of rays launched per iteration.
+        /// Use 0 (default) to let determine_batch_size() automatically compute a
+        /// batch size that fits the ray-data buffers in available GPU memory.
+        /// Throws std::out_of_range if batch_size exceeds the maximum int value.
+        void set_batch_size(uint_fast64_t batch_size)
+        {
+            if (batch_size > static_cast<uint_fast64_t>(std::numeric_limits<int>::max()))
+                throw std::out_of_range("batch_size exceeds std::numeric_limits<int>::max()");
+            m_batch_size = batch_size;
+        }
+        uint_fast64_t get_batch_size() const { return m_batch_size; }
+
+        /// Set the maximum ray interaction depth. Must be called before initialize().
+        /// Values are clamped to [2, 255]. Defaults to DEFAULT_MAX_TRACE_DEPTH = 5.
+        void set_max_ray_depth(uint64_t depth);
+        uint8_t get_max_ray_depth() const { return m_max_ray_depth; }
+
+        void set_sun(SolTrace::Data::Sun *sun) { m_sun = sun; }
 
-        void set_seed(uint64_t seed) { m_seed = seed; }  // Set sun seed
+        void set_seed(uint64_t seed) { m_seed = seed; } // Set sun seed
 
         void set_optical_errors(bool include_optical_errors)
         {
@@ -88,6 +107,10 @@ namespace OptixCSP {
         double get_time_trace();
         double get_time_setup();
 
+        /// Print a formatted summary of all timing information collected during
+        /// the last initialize() and run() calls.
+        void print_timing() const;
+
         void print_launch_params();
 
         /// <summary>
@@ -96,75 +119,117 @@ namespace OptixCSP {
         /// </summary>
         double get_sun_plane_area() const;
 
-        uint_fast64_t get_N_sun_rays() 
-        { 
-            if (m_sunraynumber_vec.empty())
-                return 0;
-            return m_sunraynumber_vec.back(); 
-        }
+        uint_fast64_t get_N_sun_rays() const { return m_n_sun_rays; }
+
+        /// Returns the number of run() iterations executed during the last run() call.
+        uint64_t get_N_run_iterations() const { return m_n_run_iterations; }
+
+        /// Returns the compacted hit records (CREATE + hits, misses excluded).
+        const std::vector<HitRecord> &get_hit_records() const { return m_hit_records; }
 
-        std::vector<uint_fast64_t> get_sunraynumber_vec() const { return m_sunraynumber_vec; }
+        /// Returns the number of rays that hit at least one element.
+        uint_fast64_t get_N_hit_rays() const { return m_n_hit_rays; }
+
+        /// Returns the number of rays terminated by max depth (excludes absorption at max depth).
+        uint_fast64_t get_N_depth_exceeded_rays() const { return m_n_depth_exceeded_rays; }
         void set_sun_shape_errors(bool flag) { this->m_include_sun_shape_errors = flag; }
 
-        
+        /// Enable or disable trimming excess rays at the end of run() so that
+        /// exactly m_number_of_rays hit rays are returned.  Enabled by default.
+        void set_trim_excess_rays(bool trim) { m_trim_excess_rays = trim; }
+        bool get_trim_excess_rays() const { return m_trim_excess_rays; }
 
     private:
+        // m_verbose and m_state must be declared before the shared_ptr managers so
+        // that they are initialized first (C++ initializes members in declaration order).
+        // GeometryManager and pipelineManager store references/copies of these at
+        // construction time, so they must be valid when the shared_ptrs are built.
+        bool m_verbose = false;
+        OptixCSP::SoltraceState m_state;
 
         std::shared_ptr<GeometryManager> geometry_manager;
         std::shared_ptr<pipelineManager> pipeline_manager;
-        std::shared_ptr<dataManager>     data_manager;
+        std::shared_ptr<dataManager> data_manager;
 
         uint_fast64_t m_number_of_rays;
         uint_fast64_t m_max_number_of_rays;
-
-        bool m_verbose;
+        uint_fast64_t m_batch_size = 0; // 0 means auto-size: determine_batch_size() calls automatic_batch_size()
+        uint8_t m_max_ray_depth = DEFAULT_MAX_TRACE_DEPTH;
+        uint_fast64_t m_n_depth_exceeded_rays = 0; // rays stopped by max depth, not absorption
 
         // Sun
-        //OptixCSP::Vec3d m_sun_vector;
-        //double m_sun_angle;
-        
-        SolTrace::Data::Sun* m_sun;
-        bool m_include_sun_shape_errors = false;
+        // OptixCSP::Vec3d m_sun_vector;
+        // double m_sun_angle;
 
+        SolTrace::Data::Sun *m_sun;
+        bool m_include_sun_shape_errors = false;
+        bool m_trim_excess_rays = true;
 
         uint64_t m_seed = 123456ULL;
-        bool m_optical_errors;
-        OptixCSP::SoltraceState m_state;
+        bool m_optical_errors = false;
 
         // Results
 
-        // Contains information on rays that hit objects
-        std::vector<float4> m_hp_vec;
-        std::vector<uint_fast64_t> m_raynumber_vec;
-        std::vector<int32_t> m_element_id_vec;
-        std::vector<uint8_t> m_hit_type_vec;
-        std::vector<uint_fast64_t> m_sunraynumber_vec;    // This is ID of hit rays out of all generated rays
+        // Compacted hit records: one contiguous array of HitRecord.
+        // Each ray group starts with a HIT_CREATE record followed by its hits.
+        // Rays that produced no hits (CREATE-only) are excluded.
+        std::vector<HitRecord> m_hit_records;
 
-        // Reused host-side scratch buffers for copying launch results back from device.
-        std::vector<float4> m_hp_output_buffer_host;
-        std::vector<int32_t> m_element_id_buffer_host;
-        std::vector<uint8_t> m_hit_type_buffer_host;
+        // Global ray index (ray_offset + local_index) for each logical hit ray in m_hit_records.
+        // Parallel to the logical rays (not records): m_hit_ray_ids.size() == m_n_hit_rays.
+        std::vector<uint64_t> m_hit_ray_ids;
+
+        // Count of rays that produced at least one non-CREATE hit.
+        uint_fast64_t m_n_hit_rays = 0;
+
+        // Total rays generated (launched from the sun plane) across all run() iterations.
+        uint_fast64_t m_n_sun_rays = 0;
 
         // Current allocated device launch buffer sizes.
-        size_t m_hit_point_buffer_size_allocated = 0;
-        size_t m_element_id_buffer_size_allocated = 0;
-        size_t m_hit_type_buffer_size_allocated = 0;
+        size_t m_hit_buffer_size_allocated = 0;
         size_t m_sun_dir_buffer_size_allocated = 0;
 
+        // Pre-allocated device scratch buffers for GPU stream compaction.
+        CompactionScratch m_compaction_scratch;
+        CompactionTimings m_compaction_timings;
+
         std::vector<std::shared_ptr<CspElement>> m_element_list;
         void create_shader_binding_table();
+        void allocate_device_buffers();
         void setup_device_buffer();
-        void get_buffer_results(std::vector<float4>& hp_vec, std::vector<uint_fast64_t>& raynumber_vec, 
-            std::vector<int32_t>& element_id_vec, std::vector<uint8_t>& hit_type_vec, 
-            std::vector<uint_fast64_t>& sunraynumber_vec);
+        // GPU-side compaction: count hits, compact buffer on device, copy result to m_hit_records.
+        // Increments m_n_hit_rays by the number of newly collected hit rays.
+        void get_buffer_results();
+        /// Computes the maximum rays-per-batch that fit in 80 % of current free
+        /// GPU memory, accounting for all per-ray device buffers and compaction
+        /// scratch. Returns 0 if memory cannot be queried.
+        uint_fast64_t automatic_batch_size() const;
+        /// Returns the effective batch size for a run() call.
+        /// If m_batch_size > 0 the user-supplied value is used as-is.
+        /// Otherwise automatic_batch_size() is called and the result is capped
+        /// at m_number_of_rays.
+        uint_fast64_t determine_batch_size() const;
 
         Timer m_timer_setup;
         Timer m_timer_trace;
 
-        // memory usage
-        size_t m_mem_free_before;
-        size_t m_mem_free_after;
+        // initialize() sub-timers
+        Timer m_timer_aabb;
+        Timer m_timer_geometry;
+        Timer m_timer_pipeline;
+        Timer m_timer_sbt;
 
+        // run() sub-timers
+        Timer m_timer_setup_buffer;
+        Timer m_timer_optix_launch;
+        Timer m_timer_collect_results;
+        uint64_t m_n_run_iterations;
 
+        // memory usage
+        size_t m_mem_free_before;     ///< Free GPU memory at the start of initialize(), before any setup allocations.
+        size_t m_mem_free_post_setup; ///< Free GPU memory at the end of initialize(), after all setup allocations (BVH,
+                                      ///  pipeline, SBT, geometry/material arrays). Used as the baseline in
+                                      ///  automatic_batch_size() so batch sizing is stable across run() calls.
+        size_t m_mem_free_after;      ///< Free GPU memory sampled during run() for per-launch memory reporting.
     };
 }
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h
index e10ad3cc..d98569a8 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h
@@ -10,7 +10,9 @@ namespace OptixCSP
 		RECTANGLE,
 		CIRCLE,
 		TRIANGLE,
-		QUADRILATERAL
+		QUADRILATERAL,
+		HEXAGON,
+		ANNULUS
 	};
 
 	// types for both scene building and pipeline assembly
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/sun_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/sun_utils.cu
index 739c2075..798b288d 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/sun_utils.cu
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/sun_utils.cu
@@ -6,7 +6,6 @@
 #include "shaders/Soltrace.h"
 
 #include <cuda_runtime.h>
-#include <iostream>
 
 namespace OptixCSP {
 
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h
index 5abd5009..217e64dd 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h
@@ -8,10 +8,11 @@
 #define assert(x) /*nop*/
 #endif
 
-// TODO: get rid of ST suffix, no clue what it was for ... 
+// TODO: get rid of ST suffix, no clue what it was for ...
 
-namespace OptixCSP {
-   struct GeometryDataST
+namespace OptixCSP
+{
+    struct GeometryDataST
     {
         enum Type
         {
@@ -20,20 +21,21 @@ namespace OptixCSP {
             RECTANGLE_PARABOLIC = 2,
             UNKNOWN_TYPE = 3,
             RECTANGLE_FLAT = 4,
-			TRIANGLE_FLAT = 5,
-            QUADRILATERAL_FLAT = 6
+            TRIANGLE_FLAT = 5,
+            QUADRILATERAL_FLAT = 6,
+            CIRCLE_FLAT = 7,
+            HEXAGON_FLAT = 8,
+	    ANNULUS_FLAT = 9
         };
 
         struct Parallelogram
         {
             Parallelogram() = default;
             Parallelogram(float3 v1, float3 v2, float3 anchor)
-                : v1(v1)
-                , v2(v2)
-                , anchor(anchor)
+                : v1(v1), v2(v2), anchor(anchor)
             {
                 float3 normal = normalize(cross(v1, v2));
-                float  d = dot(normal, anchor);
+                float d = dot(normal, anchor);
                 this->v1 *= 1.0f / dot(v1, v1);
                 this->v2 *= 1.0f / dot(v2, v2);
                 plane = make_float4(normal, d);
@@ -52,7 +54,7 @@ namespace OptixCSP {
                 : center(center), x(x), y(y), width(width), height(height)
             {
                 float3 normal = normalize(cross(x, y));
-                float  d = dot(normal, center);
+                float d = dot(normal, center);
                 plane = make_float4(normal, d);
             }
 
@@ -64,34 +66,28 @@ namespace OptixCSP {
             float height;
         };
 
-        struct Cylinder_Y {
+        struct Cylinder_Y
+        {
             Cylinder_Y() = default;
             Cylinder_Y(float3 center, float radius, float half_height, float3 base_x, float3 base_z)
-                : center(center)
-                , radius(radius)
-                , half_height(half_height)
-                , base_x(base_x)
-                , base_z(base_z) {
+                : center(center), radius(radius), half_height(half_height), base_x(base_x), base_z(base_z)
+            {
                 assert(dot(base_x, base_z) < 1e-3f);
             }
 
-
             float3 center;
             float radius;
             float half_height;
-            float3 base_x;   // x axis of the cylinder
-            float3 base_z;   // z axis of the cylinder
+            float3 base_x; // x axis of the cylinder
+            float3 base_z; // z axis of the cylinder
         };
 
-        struct Rectangle_Parabolic {
+        struct Rectangle_Parabolic
+        {
 
             Rectangle_Parabolic() = default;
             Rectangle_Parabolic(float3 v1, float3 v2, float3 anchor, float curv_x, float curv_y)
-                : v1(v1)
-                , v2(v2)
-                , anchor(anchor)
-                , curv_x(curv_x)
-                , curv_y(curv_y)
+                : v1(v1), v2(v2), anchor(anchor), curv_x(curv_x), curv_y(curv_y)
             {
                 float3 normal = normalize(cross(v1, v2));
                 float d = dot(normal, anchor);
@@ -104,14 +100,15 @@ namespace OptixCSP {
             float3 v1;
             float3 v2;
             float3 anchor;
-            //float3 focus;
+            // float3 focus;
             float curv_x;
             float curv_y;
         };
 
-        struct Triangle_Flat {
+        struct Triangle_Flat
+        {
             Triangle_Flat() = default;
-            Triangle_Flat(const float3& a, const float3& b, const float3& c)
+            Triangle_Flat(const float3 &a, const float3 &b, const float3 &c)
                 : v0(a), e1(b - a), e2(c - a)
             {
                 normal = normalize(cross(e1, e2));
@@ -120,10 +117,11 @@ namespace OptixCSP {
             float3 v0;     // base vertex
             float3 e1, e2; // edges
             float3 normal;
-            float  d;      // plane distance
+            float d; // plane distance
         };
 
-        struct Quadrilateral_Flat{
+        struct Quadrilateral_Flat
+        {
             Quadrilateral_Flat() = default;
             Quadrilateral_Flat(const float3 &a, const float3 &b,
                                const float3 &c, const float3 &d)
@@ -134,75 +132,118 @@ namespace OptixCSP {
                 normal = normalize(cross(e1, e2));
             }
             float3 p0, p1, p2, p3; // Vertices in counterclockwise order
-            float3 normal; // Positive direction follows right-hand rule
+            float3 normal;         // Positive direction follows right-hand rule
+        };
+
+        struct Circle_Flat
+        {
+            Circle_Flat() = default;
+            // Circle_Flat(const float radius) : r(radius) {}
+            Circle_Flat(const float3 &origin, const float3 &normal, const float &radius)
+                : r(radius), center(origin)
+            {
+                plane = make_float4(normalize(normal), dot(center, normal));
+            }
+            float4 plane;
+            float3 center;
+            float r;
+        };
+
+        struct Hexagon_Flat
+        {
+            Hexagon_Flat() = default;
+            Hexagon_Flat(const float3 &origin, const float3 &normal, const float &side_length)
+                : s(side_length), center(origin)
+            {
+                plane = make_float4(normalize(normal), dot(center, normal));
+            }
+            float4 plane;
+            float3 center;
+            float s;
+        };
+
+        struct Annulus_Flat
+        {
+            Annulus_Flat() = default;
+            Annulus_Flat(const float3 &origin, const float3 &normal,
+                         const float &r_inner, const float &r_outer, const float &arc)
+	      : center(origin), ri(r_inner), ro(r_outer), arc(arc)
+            {
+                plane = make_float4(normalize(normal), dot(center, normal));
+            }
+            float4 plane;
+            float3 center;
+            float ri;
+            float ro;
+	  float arc; // Arc angle in radians with x-axis in the middle
         };
 
         GeometryDataST() = default;
 
-        void setParallelogram(const Parallelogram& p)
+        void setParallelogram(const Parallelogram &p)
         {
             assert(type == UNKNOWN_TYPE);
             type = PARALLELOGRAM;
             parallelogram = p;
         }
 
-        __host__ __device__ const Parallelogram& getParallelogram() const
+        __host__ __device__ const Parallelogram &getParallelogram() const
         {
             assert(type == PARALLELOGRAM);
             return parallelogram;
         }
 
-        void setRectangle_Flat(const Rectangle_Flat& r)
+        void setRectangle_Flat(const Rectangle_Flat &r)
         {
             assert(type == UNKNOWN_TYPE);
             type = RECTANGLE_FLAT;
             rectangle_flat = r;
         }
 
-        __host__ __device__ const Rectangle_Flat& getRectangle_Flat() const
+        __host__ __device__ const Rectangle_Flat &getRectangle_Flat() const
         {
             assert(type == RECTANGLE_FLAT);
             return rectangle_flat;
         }
 
-        void setCylinder_Y(const Cylinder_Y& c)
+        void setCylinder_Y(const Cylinder_Y &c)
         {
             assert(type == UNKNOWN_TYPE);
             type = CYLINDER_Y;
             cylinder_y = c;
         }
 
-        __host__ __device__ const Cylinder_Y& getCylinder_Y() const
+        __host__ __device__ const Cylinder_Y &getCylinder_Y() const
         {
             assert(type == CYLINDER_Y);
             return cylinder_y;
         }
 
-        void setRectangleParabolic(const Rectangle_Parabolic& r)
+        void setRectangleParabolic(const Rectangle_Parabolic &r)
         {
             assert(type == UNKNOWN_TYPE);
             type = RECTANGLE_PARABOLIC;
             rectangle_parabolic = r;
         }
 
-        __host__ __device__ const Rectangle_Parabolic& getRectangleParabolic() const
+        __host__ __device__ const Rectangle_Parabolic &getRectangleParabolic() const
         {
             assert(type == RECTANGLE_PARABOLIC);
             return rectangle_parabolic;
         }
 
-        void setTriangle_Flat(const Triangle_Flat& t)
+        void setTriangle_Flat(const Triangle_Flat &t)
         {
             assert(type == UNKNOWN_TYPE);
             type = TRIANGLE_FLAT;
             triangle_flat = t;
-		}
+        }
 
-        __host__ __device__ const Triangle_Flat& getTriangle_Flat() const
+        __host__ __device__ const Triangle_Flat &getTriangle_Flat() const
         {
             assert(type == TRIANGLE_FLAT);
             return triangle_flat;
-		}
+        }
 
         void setQuadrilateral_Flat(const Quadrilateral_Flat &q)
         {
@@ -211,12 +252,50 @@ namespace OptixCSP {
             quadrilateral_flat = q;
         }
 
-        __host__ __device__ const Quadrilateral_Flat& getQuadrilateral_Flat() const
+        __host__ __device__ const Quadrilateral_Flat &getQuadrilateral_Flat() const
         {
             assert(type == QUADRILATERAL_FLAT);
             return quadrilateral_flat;
         }
 
+        void setCircle_Flat(const Circle_Flat &c)
+        {
+            assert(type == UNKNOWN_TYPE);
+            type = CIRCLE_FLAT;
+            circle_flat = c;
+        }
+
+        __host__ __device__ const Circle_Flat &getCircle_Flat() const
+        {
+            assert(type == CIRCLE_FLAT);
+            return circle_flat;
+        }
+
+        void setHexagon_Flat(const Hexagon_Flat &h)
+        {
+            assert(type == UNKNOWN_TYPE);
+            type = HEXAGON_FLAT;
+            hexagon_flat = h;
+        }
+
+        __host__ __device__ const Hexagon_Flat &getHexagon_Flat() const
+        {
+            assert(type == HEXAGON_FLAT);
+            return hexagon_flat;
+        }
+
+        void setAnnulus_Flat(const Annulus_Flat &anf)
+        {
+            assert(type == UNKNOWN_TYPE);
+            type = ANNULUS_FLAT;
+            annulus_flat = anf;
+        }
+
+        __host__ __device__ const Annulus_Flat &getAnnulus_Flat() const
+        {
+            assert(type == ANNULUS_FLAT);
+            return annulus_flat;
+        }
 
         Type type = UNKNOWN_TYPE;
 
@@ -229,8 +308,11 @@ namespace OptixCSP {
             Cylinder_Y cylinder_y;
             Rectangle_Parabolic rectangle_parabolic;
             Rectangle_Flat rectangle_flat;
-			Triangle_Flat triangle_flat;
+            Triangle_Flat triangle_flat;
             Quadrilateral_Flat quadrilateral_flat;
+            Circle_Flat circle_flat;
+            Hexagon_Flat hexagon_flat;
+            Annulus_Flat annulus_flat;
         };
     };
 }
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h
index 42e3838b..1c4c87ab 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h
@@ -4,6 +4,7 @@
 #include "MaterialDataST.h"
 #include "soltrace_constants.h"
 
+#include <cstdint>
 #include <vector_types.h>
 #include <optix.h>
 #include <curand_kernel.h>
@@ -12,13 +13,22 @@ namespace OptixCSP{
 
     const unsigned int NUM_ATTRIBUTE_VALUES = 4u;
     const unsigned int NUM_PAYLOAD_VALUES   = 2u;
-    const unsigned int MAX_TRACE_DEPTH      = 5u;
+    // NOTE: Maximum number of ray interactions in tracing with the geometry is
+    // DEFAULT_MAX_TRACE_DEPTH - 1 (so currently 4). See the end of the function
+    // __closesthit__element in materials.cu. Note the type. Limited to 255.
+    const uint8_t DEFAULT_MAX_TRACE_DEPTH = 5u;
 
     struct HitGroupData
     {
         MaterialData material_data;
     };
 
+    struct HitRecord {
+        float4 hit_point;
+        int32_t element_id;
+        uint8_t hit_type;
+    };
+
     enum RayType
     {
         RAY_TYPE_RADIANCE = 0,
@@ -31,6 +41,9 @@ namespace OptixCSP{
         CYLINDRICAL          = 2,
         TRIANGLE_FLAT        = 3,
         QUADRILATERAL_FLAT   = 4,
+        CIRCLE_FLAT          = 5,
+        HEXAGON_FLAT         = 6,
+        ANNULUS_FLAT         = 7,
 	    NUM_OPTICAL_ENTITY_TYPES
     };
 
@@ -40,15 +53,18 @@ namespace OptixCSP{
 
         unsigned int                width;   // essentially number of rays launched and sun points 
         unsigned int                height;
-        int                         max_depth;
-        unsigned int                ray_offset; // Global offset for current branch
+        unsigned int                max_depth;
+        unsigned long long          ray_offset; // Global offset for current branch
 
-        float4*                     hit_point_buffer;
+        // float4*                     hit_point_buffer;
+        HitRecord*                  hit_buffer;
         float3*                     sun_dir_buffer;
         curandState*                rng_states;
         OptixTraversableHandle      handle;
-        int32_t*                    element_id_buffer;
-        uint8_t*                    hit_type_buffer;
+        // int32_t*                    element_id_buffer;
+        // uint8_t*                    hit_type_buffer;
+        uint64_t*                   d_depth_exceeded_count; // Atomic counter: rays stopped by max depth, not absorption
+
 
         float3                      sun_vector;
         bool                        include_sun_shape_errors;
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu
index 6b25206d..b9688d39 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu
@@ -1,46 +1,54 @@
 #include <optix.h>
-//#include <cuda/helpers.h>
+// #include <cuda/helpers.h>
 #include "Soltrace.h"
 #include <stdio.h>
 #include "GeometryDataST.h"
 
-extern "C" {
+extern "C"
+{
     __constant__ OptixCSP::LaunchParams params;
 }
 
+extern "C" __device__ __inline__ float ray_distance_to_plane(float3 ro, float3 rd, float4 plane)
+{
+    const float3 n = make_float3(plane);
+    return (plane.w - dot(n, ro)) / dot(rd, n);
+}
 
 extern "C" __global__ void __intersection__parallelogram()
 {
-	int i = optixGetPrimitiveIndex();
-    const OptixCSP::GeometryDataST::Parallelogram& parallelogram = params.geometry_data_array[i].getParallelogram();
-        
+    int i = optixGetPrimitiveIndex();
+    const OptixCSP::GeometryDataST::Parallelogram &parallelogram = params.geometry_data_array[i].getParallelogram();
+
     // Get ray information: origin, direction, and min/max distances over which ray should be tested
     const float3 ray_orig = optixGetWorldRayOrigin();
-    const float3 ray_dir  = optixGetWorldRayDirection();
-    const float  ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax();
+    const float3 ray_dir = optixGetWorldRayDirection();
+    const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax();
 
-    // Compute ray intersection point
-    float3 n  = make_float3( parallelogram.plane );
-    float  dt = dot( ray_dir, n );
-    // Compute distance t (point of intersection) along ray direction from ray origin
-    float  t  = ( parallelogram.plane.w - dot( n, ray_orig ) ) / dt;
+    // // Compute ray intersection point
+    // float3 n  = make_float3( parallelogram.plane );
+    // float  dt = dot( ray_dir, n );
+    // // Compute distance t (point of intersection) along ray direction from ray origin
+    // float  t  = ( parallelogram.plane.w - dot( n, ray_orig ) ) / dt;
+    float t = ray_distance_to_plane(ray_orig, ray_dir, parallelogram.plane);
+    const float4 n = parallelogram.plane;
 
     // Verify intersection distance and Report ray intersection point
-    if( t > ray_tmin && t < ray_tmax )
+    if (t > ray_tmin && t < ray_tmax)
     {
-        float3 p  = ray_orig + ray_dir * t;
+        float3 p = ray_orig + ray_dir * t;
         float3 vi = p - parallelogram.anchor;
-        float  a1 = dot( parallelogram.v1, vi );
-        if( a1 >= 0 && a1 <= 1 )
+        float a1 = dot(parallelogram.v1, vi);
+        if (a1 >= 0 && a1 <= 1)
         {
-            float a2 = dot( parallelogram.v2, vi );
-            if( a2 >= 0 && a2 <= 1 )
+            float a2 = dot(parallelogram.v2, vi);
+            if (a2 >= 0 && a2 <= 1)
             {
-                optixReportIntersection( t, 
-                    0, 
-                    __float_as_uint( n.x ), 
-                    __float_as_uint( n.y ), 
-                    __float_as_uint( n.z ));
+                optixReportIntersection(t,
+                                        0,
+                                        __float_as_uint(n.x),
+                                        __float_as_uint(n.y),
+                                        __float_as_uint(n.z));
             }
         }
     }
@@ -49,61 +57,63 @@ extern "C" __global__ void __intersection__parallelogram()
 extern "C" __global__ void __intersection__rectangle_flat()
 {
 
-	const OptixCSP::GeometryDataST::Rectangle_Flat& rectangle = params.geometry_data_array[optixGetPrimitiveIndex()].getRectangle_Flat();
-        
+    const OptixCSP::GeometryDataST::Rectangle_Flat &rectangle = params.geometry_data_array[optixGetPrimitiveIndex()].getRectangle_Flat();
+
     const float3 ray_orig = optixGetWorldRayOrigin();
     const float3 ray_dir = optixGetWorldRayDirection();
     const float ray_tmin = optixGetRayTmin();
     const float ray_tmax = optixGetRayTmax();
 
-    // Get plane normal and distance
-    float3 n = make_float3(rectangle.plane);
-    float dt = dot(ray_dir, n);
-    
-    // Compute distance t (point of intersection) along ray direction from ray origin
-    float t = (rectangle.plane.w - dot(n, ray_orig)) / dt;
+    // // Get plane normal and distance
+    // float3 n = make_float3(rectangle.plane);
+    // float dt = dot(ray_dir, n);
+
+    // // Compute distance t (point of intersection) along ray direction from ray origin
+    // float t = (rectangle.plane.w - dot(n, ray_orig)) / dt;
+    float t = ray_distance_to_plane(ray_orig, ray_dir, rectangle.plane);
+    const float4 n = rectangle.plane;
 
     // Verify intersection distance
     if (t > ray_tmin && t < ray_tmax)
     {
         // Compute intersection point
         float3 p = ray_orig + ray_dir * t;
-        
+
         // Compute vector from center to intersection point
         float3 v = p - rectangle.center;
-        
+
         // Project onto x and y to get local coordinates
         float x = dot(rectangle.x, v);
         float y = dot(rectangle.y, v);
-        
+
         // Check if point is within rectangle bounds
-        if (x >= -rectangle.width/2 && x <= rectangle.width/2 &&
-            y >= -rectangle.height/2 && y <= rectangle.height/2)
+        if (x >= -rectangle.width / 2 && x <= rectangle.width / 2 &&
+            y >= -rectangle.height / 2 && y <= rectangle.height / 2)
         {
             optixReportIntersection(t,
-                0,
-                __float_as_uint(n.x),
-                __float_as_uint(n.y),
-                __float_as_uint(n.z));
+                                    0,
+                                    __float_as_uint(n.x),
+                                    __float_as_uint(n.y),
+                                    __float_as_uint(n.z));
         }
     }
 }
 
 extern "C" __global__ void __intersection__cylinder_y()
 {
-	const OptixCSP::GeometryDataST::Cylinder_Y& cyl = params.geometry_data_array[optixGetPrimitiveIndex()].getCylinder_Y();
+    const OptixCSP::GeometryDataST::Cylinder_Y &cyl = params.geometry_data_array[optixGetPrimitiveIndex()].getCylinder_Y();
 
     // Get ray information: origin, direction, and min/max distances over which ray should be tested
     const float3 ray_orig = optixGetWorldRayOrigin();
     const float3 ray_dir = normalize(optixGetWorldRayDirection());
-    const float  ray_tmin = optixGetRayTmin();
-    const float  ray_tmax = optixGetRayTmax();
+    const float ray_tmin = optixGetRayTmin();
+    const float ray_tmax = optixGetRayTmax();
 
     // Transform ray to the cylinder's local coordinate system
     float3 local_ray_orig = ray_orig - cyl.center;
     float3 local_ray_dir = ray_dir;
 
-	// TODO: check how to optimize this, there should be a way in optix to rotate coordinates 
+    // TODO: check how to optimize this, there should be a way in optix to rotate coordinates
     float3 local_x = cyl.base_x;
     float3 local_z = cyl.base_z;
     float3 local_y = cross(local_z, local_x);
@@ -111,15 +121,13 @@ extern "C" __global__ void __intersection__cylinder_y()
     local_ray_orig = make_float3(
         dot(local_ray_orig, local_x),
         dot(local_ray_orig, local_y),
-        dot(local_ray_orig, local_z)
-    );
+        dot(local_ray_orig, local_z));
     local_ray_dir = make_float3(
         dot(local_ray_dir, local_x),
         dot(local_ray_dir, local_y),
-        dot(local_ray_dir, local_z)
-    );
+        dot(local_ray_dir, local_z));
 
-	// solve quadratic equation for intersection
+    // solve quadratic equation for intersection
     float A = local_ray_dir.x * local_ray_dir.x + local_ray_dir.z * local_ray_dir.z;
     float B = 2.0f * (local_ray_orig.x * local_ray_dir.x + local_ray_orig.z * local_ray_dir.z);
     float C = local_ray_orig.x * local_ray_orig.x + local_ray_orig.z * local_ray_orig.z - cyl.radius * cyl.radius;
@@ -169,23 +177,23 @@ extern "C" __global__ void __intersection__cylinder_y()
 
     // Report intersection to OptiX
     optixReportIntersection(t,
-        0,
-        __float_as_uint(world_normal.x),
-        __float_as_uint(world_normal.y),
-        __float_as_uint(world_normal.z));
+                            0,
+                            __float_as_uint(world_normal.x),
+                            __float_as_uint(world_normal.y),
+                            __float_as_uint(world_normal.z));
 }
 
-// ray cylinder intersection with top and bottom caps 
-// it can also be modeled as cylinder with two disks. 
+// ray cylinder intersection with top and bottom caps
+// it can also be modeled as cylinder with two disks.
 extern "C" __global__ void __intersection__cylinder_y_capped()
 {
-	const OptixCSP::GeometryDataST::Cylinder_Y& cyl = params.geometry_data_array[optixGetPrimitiveIndex()].getCylinder_Y();
+    const OptixCSP::GeometryDataST::Cylinder_Y &cyl = params.geometry_data_array[optixGetPrimitiveIndex()].getCylinder_Y();
 
     // Get ray information: origin, direction, and min/max distances over which ray should be tested
     const float3 ray_orig = optixGetWorldRayOrigin();
     const float3 ray_dir = normalize(optixGetWorldRayDirection());
-    const float  ray_tmin = optixGetRayTmin();
-    const float  ray_tmax = optixGetRayTmax();
+    const float ray_tmin = optixGetRayTmin();
+    const float ray_tmax = optixGetRayTmax();
 
     // Transform ray to the cylinder's local coordinate system
     float3 local_ray_orig = ray_orig - cyl.center;
@@ -199,13 +207,11 @@ extern "C" __global__ void __intersection__cylinder_y_capped()
     local_ray_orig = make_float3(
         dot(local_ray_orig, local_x),
         dot(local_ray_orig, local_y),
-        dot(local_ray_orig, local_z)
-    );
+        dot(local_ray_orig, local_z));
     local_ray_dir = make_float3(
         dot(local_ray_dir, local_x),
         dot(local_ray_dir, local_y),
-        dot(local_ray_dir, local_z)
-    );
+        dot(local_ray_dir, local_z));
 
     // Solve quadratic equation for intersection with curved surface
     float A = local_ray_dir.x * local_ray_dir.x + local_ray_dir.z * local_ray_dir.z;
@@ -240,7 +246,7 @@ extern "C" __global__ void __intersection__cylinder_y_capped()
         {
             float t = (-cyl.half_height - local_ray_orig.y) / local_ray_dir.y;
             float2 hit_point = make_float2(local_ray_orig.x + t * local_ray_dir.x,
-                local_ray_orig.z + t * local_ray_dir.z);
+                                           local_ray_orig.z + t * local_ray_dir.z);
             if (t > ray_tmin && t < ray_tmax && dot(hit_point, hit_point) <= cyl.radius * cyl.radius)
             {
                 t_caps = t;
@@ -252,7 +258,7 @@ extern "C" __global__ void __intersection__cylinder_y_capped()
         {
             float t = (cyl.half_height - local_ray_orig.y) / local_ray_dir.y;
             float2 hit_point = make_float2(local_ray_orig.x + t * local_ray_dir.x,
-                local_ray_orig.z + t * local_ray_dir.z);
+                                           local_ray_orig.z + t * local_ray_dir.z);
             if (t > ray_tmin && t < ray_tmax && dot(hit_point, hit_point) <= cyl.radius * cyl.radius)
             {
                 t_caps = fminf(t_caps, t);
@@ -294,12 +300,10 @@ extern "C" __global__ void __intersection__cylinder_y_capped()
         0, // User-defined instance ID or custom data
         __float_as_uint(world_normal.x),
         __float_as_uint(world_normal.y),
-        __float_as_uint(world_normal.z)
-    );
+        __float_as_uint(world_normal.z));
 }
 
-
-// For a parabolic surface rectangle aperture where 
+// For a parabolic surface rectangle aperture where
 // the base (normal projection) is defined by the center and its two unit edge vectors
 // In a local coordinate system (with origin at the anchor) the flat rectangle covers:
 //    x in [0, L1]  and  y in [0, L2],
@@ -320,12 +324,12 @@ extern "C" __global__ void __intersection__cylinder_y_capped()
 // The local hit point is then transformed back to world space for reporting.
 extern "C" __global__ void __intersection__rectangle_parabolic()
 {
-    const OptixCSP::GeometryDataST::Rectangle_Parabolic& rect = params.geometry_data_array[optixGetPrimitiveIndex()].getRectangleParabolic();
+    const OptixCSP::GeometryDataST::Rectangle_Parabolic &rect = params.geometry_data_array[optixGetPrimitiveIndex()].getRectangleParabolic();
     // Get ray information.
     const float3 ray_orig = optixGetWorldRayOrigin();
     const float3 ray_dir = optixGetWorldRayDirection();
-    const float  ray_tmin = optixGetRayTmin();
-    const float  ray_tmax = optixGetRayTmax();
+    const float ray_tmin = optixGetRayTmin();
+    const float ray_tmax = optixGetRayTmax();
 
     //
     // Build the local coordinate system.
@@ -396,9 +400,11 @@ extern "C" __global__ void __intersection__rectangle_parabolic()
             }
         }
     }
-    else {
+    else
+    {
         float discr = B * B - 4.0f * A * C;
-        if (discr >= 0.0f) {
+        if (discr >= 0.0f)
+        {
             float sqrt_discr = sqrtf(discr);
             t1 = -0.5f * (B + sqrt_discr) / A;
             t2 = -0.5f * (B - sqrt_discr) / A;
@@ -443,12 +449,12 @@ extern "C" __global__ void __intersection__rectangle_parabolic()
     //    N_local = (-f_x, -f_y, 1) = ( -curv_x*x_hit, -curv_y*y_hit, 1 ).
     //
     float3 N_local = normalize(make_float3(-curv_x * x_hit,
-        -curv_y * y_hit,
-        1.0f));
+                                           -curv_y * y_hit,
+                                           1.0f));
     // Transform the normal back to world coordinates.
     float3 world_normal = normalize(N_local.x * e1 +
-        N_local.y * e2 +
-        N_local.z * n);
+                                    N_local.y * e2 +
+                                    N_local.z * n);
 
     // Compute the hit point in world space.
     float3 world_hit = ray_orig + t * ray_dir;
@@ -457,19 +463,19 @@ extern "C" __global__ void __intersection__rectangle_parabolic()
     // Here, the two reported extra attributes are the parametric coordinates (a1, a2),
     // encoded as unsigned integers.
     optixReportIntersection(t, 0,
-        __float_as_uint(world_normal.x),
-        __float_as_uint(world_normal.y),
-        __float_as_uint(world_normal.z));    
+                            __float_as_uint(world_normal.x),
+                            __float_as_uint(world_normal.y),
+                            __float_as_uint(world_normal.z));
 }
 
 // intersection algorithm for a flat triangle based on "Fast, Minimum Storage Ray/Triangle Intersection" by M�ller and Trumbore (1997)
-// code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm 
+// code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm
 extern "C" __device__ __inline__ float _triangle_intersect(
     float3 p0, float3 edge1, float3 edge2,
     float3 ro, float3 rd)
 {
     const float3 pvec = cross(rd, edge2);
-    const float  det = dot(edge1, pvec);
+    const float det = dot(edge1, pvec);
 
     // // Backface culling + parallel rejection
     // // (det must be strictly positive and not tiny)
@@ -479,26 +485,28 @@ extern "C" __device__ __inline__ float _triangle_intersect(
     // Parallel rejection
     // (det must be not tiny)
     const float eps = 1e-8f;
-    if (fabs(det) <= eps) return -1.0f;
-
+    if (fabs(det) <= eps)
+        return -1.0f;
 
     const float inv_det = 1.0f / det;
 
     const float3 tvec = ro - p0;
-    const float  u = dot(tvec, pvec) * inv_det;
-    if (u < 0.0f || u > 1.0f) return -1.0f;
+    const float u = dot(tvec, pvec) * inv_det;
+    if (u < 0.0f || u > 1.0f)
+        return -1.0f;
 
     const float3 qvec = cross(tvec, edge1);
-    const float  v = dot(rd, qvec) * inv_det;
-    if (v < 0.0f || (u + v) > 1.0f) return -1.0f;
+    const float v = dot(rd, qvec) * inv_det;
+    if (v < 0.0f || (u + v) > 1.0f)
+        return -1.0f;
 
-    const float  t = dot(edge2, qvec) * inv_det;
+    const float t = dot(edge2, qvec) * inv_det;
 
     return t;
 }
 
 // // intersection algorithm for a flat triangle based on "Fast, Minimum Storage Ray/Triangle Intersection" by M�ller and Trumbore (1997)
-// // code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm 
+// // code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm
 // extern "C" __global__ void __intersection__triangle_flat()
 // {
 // 	const OptixCSP::GeometryDataST::Triangle_Flat& tri = params.geometry_data_array[optixGetPrimitiveIndex()].getTriangle_Flat();
@@ -511,7 +519,6 @@ extern "C" __device__ __inline__ float _triangle_intersect(
 //     const float3 edge1 = tri.e1;
 //     const float3 edge2 = tri.e2;
 
-
 //     const float3 pvec = cross(rd, edge2);
 //     const float  det = dot(edge1, pvec);
 
@@ -528,7 +535,7 @@ extern "C" __device__ __inline__ float _triangle_intersect(
 
 //     const float3 qvec = cross(tvec, edge1);
 //     const float  v = dot(rd, qvec) * inv_det;
-//     if (v < 0.0f || (u + v) > 1.0f) 
+//     if (v < 0.0f || (u + v) > 1.0f)
 //         return;
 
 //     const float  t = dot(edge2, qvec) * inv_det;
@@ -544,31 +551,30 @@ extern "C" __device__ __inline__ float _triangle_intersect(
 // }
 
 // intersection algorithm for a flat triangle based on "Fast, Minimum Storage Ray/Triangle Intersection" by M�ller and Trumbore (1997)
-// code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm 
+// code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm
 extern "C" __global__ void __intersection__triangle_flat()
 {
-	const OptixCSP::GeometryDataST::Triangle_Flat& tri = params.geometry_data_array[optixGetPrimitiveIndex()].getTriangle_Flat();
+    const OptixCSP::GeometryDataST::Triangle_Flat &tri = params.geometry_data_array[optixGetPrimitiveIndex()].getTriangle_Flat();
 
     const float3 ro = optixGetObjectRayOrigin();
     const float3 rd = optixGetObjectRayDirection();
 
     const float t = _triangle_intersect(tri.v0, tri.e1, tri.e2, ro, rd);
 
-    if (t < optixGetRayTmin() || t > optixGetRayTmax()) return;
+    if (t < optixGetRayTmin() || t > optixGetRayTmax())
+        return;
 
     float3 world_normal = tri.normal;
 
     optixReportIntersection(t, 0,
-        __float_as_uint(world_normal.x),
-        __float_as_uint(world_normal.y),
-        __float_as_uint(world_normal.z));
-
+                            __float_as_uint(world_normal.x),
+                            __float_as_uint(world_normal.y),
+                            __float_as_uint(world_normal.z));
 }
 
-
 extern "C" __global__ void __intersection__quadrilateral_flat()
 {
-    const OptixCSP::GeometryDataST::Quadrilateral_Flat& quad = params.geometry_data_array[optixGetPrimitiveIndex()].getQuadrilateral_Flat();
+    const OptixCSP::GeometryDataST::Quadrilateral_Flat &quad = params.geometry_data_array[optixGetPrimitiveIndex()].getQuadrilateral_Flat();
 
     const float3 ro = optixGetObjectRayOrigin();
     const float3 rd = optixGetObjectRayDirection();
@@ -587,12 +593,138 @@ extern "C" __global__ void __intersection__quadrilateral_flat()
         t = _triangle_intersect(p2, e1, e2, ro, rd);
     }
 
-    if (t < optixGetRayTmin() || t > optixGetRayTmax()) return;
+    if (t < optixGetRayTmin() || t > optixGetRayTmax())
+        return;
 
     float3 world_normal = quad.normal;
 
     optixReportIntersection(t, 0,
-        __float_as_uint(world_normal.x),
-        __float_as_uint(world_normal.y),
-        __float_as_uint(world_normal.z));
+                            __float_as_uint(world_normal.x),
+                            __float_as_uint(world_normal.y),
+                            __float_as_uint(world_normal.z));
+}
+
+extern "C" __global__ void __intersection__circle_flat()
+{
+    const OptixCSP::GeometryDataST::Circle_Flat &circ = params.geometry_data_array[optixGetPrimitiveIndex()].getCircle_Flat();
+
+    // Get ray information: origin, direction, and min/max distances over which ray should be tested
+    const float3 ray_orig = optixGetWorldRayOrigin();
+    const float3 ray_dir = optixGetWorldRayDirection();
+    const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax();
+
+    // // Compute ray intersection point
+    // float3 n  = make_float3( circ.plane );
+    // float  dt = dot( ray_dir, n );
+    // // Compute distance t (point of intersection) along ray direction from ray origin
+    // float  t  = ( circ.plane.w - dot( n, ray_orig ) ) / dt;
+    float t = ray_distance_to_plane(ray_orig, ray_dir, circ.plane);
+    const float4 n = circ.plane;
+
+    // Verify intersection distance and Report ray intersection point
+    if (t > ray_tmin && t < ray_tmax)
+    {
+        float3 p = ray_orig + ray_dir * t;
+        float d = length(p - circ.center);
+        if (d <= circ.r)
+        {
+            optixReportIntersection(t,
+                                    0,
+                                    __float_as_uint(n.x),
+                                    __float_as_uint(n.y),
+                                    __float_as_uint(n.z));
+        }
+    }
+}
+
+extern "C" __global__ void __intersection__hexagon_flat()
+{
+    const OptixCSP::GeometryDataST::Hexagon_Flat &hex = params.geometry_data_array[optixGetPrimitiveIndex()].getHexagon_Flat();
+
+    // Get ray information: origin, direction, and min/max distances over which ray should be tested
+    const float3 ray_orig = optixGetWorldRayOrigin();
+    const float3 ray_dir = optixGetWorldRayDirection();
+    const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax();
+
+    float t = ray_distance_to_plane(ray_orig, ray_dir, hex.plane);
+    const float4 n = hex.plane;
+
+    // Verify intersection distance and Report ray intersection point
+    if (t > ray_tmin && t < ray_tmax)
+    {
+        // TODO: Need to adjust for possible rotation...
+        bool is_in = false;
+        float3 p = ray_orig + ray_dir * t - hex.center;
+        // float d = length(p - circ.center);
+        float s = hex.s;
+        float xl = 0.5f * s;
+        float yl = 0.5f * sqrtf(3.0f) * s;
+        if (-xl <= p.x && p.x <= xl && -yl <= p.y && p.y <= yl)
+        {
+            // Center
+            is_in = true;
+        }
+        else if (-s <= p.x && p.x < xl)
+        {
+            // Left side
+            float y1 = sqrtf(3.0f) * (p.x + s);
+            float y2 = -y1;
+            if (y2 <= p.y && p.y <= y1)
+            {
+                is_in = true;
+            }
+        }
+        else if (xl < p.x && p.x <= s)
+        {
+            // Right side
+            float y1 = sqrtf(3.0f) * (p.x - s);
+            float y2 = -y1;
+            if (y1 <= p.y && p.y <= y2)
+            {
+                is_in = true;
+            }
+        }
+
+        if (is_in)
+        {
+            optixReportIntersection(t,
+                                    0,
+                                    __float_as_uint(n.x),
+                                    __float_as_uint(n.y),
+                                    __float_as_uint(n.z));
+        }
+    }
+}
+
+extern "C" __global__ void __intersection__annulus_flat()
+{
+    const OptixCSP::GeometryDataST::Annulus_Flat &anf = params.geometry_data_array[optixGetPrimitiveIndex()].getAnnulus_Flat();
+
+    // Get ray information: origin, direction, and min/max distances over which ray should be tested
+    const float3 ray_orig = optixGetWorldRayOrigin();
+    const float3 ray_dir = optixGetWorldRayDirection();
+    const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax();
+
+    float t = ray_distance_to_plane(ray_orig, ray_dir, anf.plane);
+    const float4 n = anf.plane;
+
+    // Verify intersection distance and Report ray intersection point
+    if (t > ray_tmin && t < ray_tmax)
+    {
+        // TODO: Need to adjust for possible rotation...
+        float3 p = ray_orig + ray_dir * t - anf.center;
+        float d = length(p);
+        if (anf.ri <= d && d <= anf.ro)
+        {
+            float theta = atan2f(p.y, p.x);
+            if (fabsf(theta) <= 0.5f * anf.arc)
+            {
+                optixReportIntersection(t,
+                                        0,
+                                        __float_as_uint(n.x),
+                                        __float_as_uint(n.y),
+                                        __float_as_uint(n.z));
+            }
+        }
+    }
 }
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu
index 0c896c55..004dbf73 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu
@@ -125,7 +125,7 @@ extern "C" __global__ void __closesthit__element()
     const float3 hit_point = ray_orig + ray_t * ray_dir;
 
     OptixCSP::PerRayData prd = OptixCSP::getPayload();
-    const int new_depth = prd.depth + 1; // Increment the ray depth for recursive tracing
+    const unsigned int new_depth = prd.depth + 1; // Increment the ray depth for recursive tracing
 
     // we have two scenarios here
     // if we use refraction, then we look at transmissivity to determine if the ray will refract
@@ -241,17 +241,17 @@ extern "C" __global__ void __closesthit__element()
     {
 
         // Get buffer slot
-        const int slot = params.max_depth * prd.ray_path_index + new_depth;
+        const unsigned int slot = params.max_depth * prd.ray_path_index + new_depth;
 
         // Store the hit point in the hit point buffer (used for visualization or further calculations)
-        params.hit_point_buffer[slot] = make_float4(new_depth, hit_point);
+        params.hit_buffer[slot].hit_point = make_float4(new_depth, hit_point);
 
         // Store element id
         const int32_t elementId = params.geometry_data_array[optixGetPrimitiveIndex()].id;
-        params.element_id_buffer[slot] = elementId;
+        params.hit_buffer[slot].element_id = elementId;
 
         // Store hit type
-        params.hit_type_buffer[slot] = hit_type;
+        params.hit_buffer[slot].hit_type = hit_type;
 
         // Store the reflected direction in its buffer (used for visualization or further calculations)
         /*
@@ -285,6 +285,11 @@ extern "C" __global__ void __closesthit__element()
             prd.depth = params.max_depth; // terminate the ray by setting depth to max depth
         }
     }
+    else if (!absorbed)
+    {
+        // Ray hit an element but max depth was reached; count it (absorption at this depth does not count).
+        atomicAdd(reinterpret_cast<unsigned long long*>(params.d_depth_exceeded_count), 1ULL);
+    }
 
     setPayload(prd);
 }
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu
index 1d5cf084..fd83a085 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu
@@ -2,27 +2,29 @@
 #include <curand_kernel.h>
 #include <vector_types.h>
 
-// todo: move curand initializatin to global function
-
-//#include <cuda/helpers.h>
-//#include <cuda/random.h>
+// #include <cuda/helpers.h>
+// #include <cuda/random.h>
 #include "Soltrace.h"
 #include "soltrace_constants.h"
 
 #include <cstdio>
 
 // Launch parameters for soltrace
-extern "C" {
+extern "C"
+{
     __constant__ OptixCSP::LaunchParams params;
 }
 
-namespace OptixCSP {
+namespace OptixCSP
+{
 
     // Halton sequence generator, used for quasi-random sampling
     // Generates a Halton sequence value for a given index and base
-    __device__ float halton(int index, int base) {
+    __device__ float halton(unsigned int index, unsigned int base)
+    {
         float f = 1.0f, result = 0.0f;
-        while (index > 0) {
+        while (index > 0)
+        {
             f = f / base;
             result = result + f * (index % base);
             index = index / base;
@@ -32,7 +34,8 @@ namespace OptixCSP {
 
     // Generate a sample point within a parallelogram defined by the AABB (Axis-Aligned Bounding Box)
     // Uses the Halton sequence for sampling
-    __device__ float3 haltonSampleInParallelogram(unsigned int sample_index) {
+    __device__ float3 haltonSampleInParallelogram(unsigned int sample_index)
+    {
         // Generate Halton sequence values
         float u = halton(sample_index, 2); // Base 2 for x
         float v = halton(sample_index, 3); // Base 3 for y
@@ -60,7 +63,8 @@ namespace OptixCSP {
     }
 
     // Sample a random ray direction within a cone defined by a maximum angle
-    __device__ float3 sampleRayDirectionInCone_Pillbox(float3 dir, float half_angle, unsigned int ray_number) {
+    __device__ float3 sampleRayDirectionInCone_Pillbox(float3 dir, float half_angle, unsigned int ray_number)
+    {
         curandState rng_state = params.rng_states[ray_number];
 
         const float half_angle_mrad = half_angle;
@@ -91,10 +95,11 @@ namespace OptixCSP {
         return normalize(sin_t * (cosf(phi) * u + sinf(phi) * v) + cos_t * w);
     }
 
-    __device__ float3 sampleRayDirectionInCone_Gaussian(float3 dir, float sigma, unsigned int ray_number) {
+    __device__ float3 sampleRayDirectionInCone_Gaussian(float3 dir, float sigma, unsigned int ray_number)
+    {
         curandState rng = params.rng_states[ray_number];
 
-        const float sigma_rad = sigma * 0.001f;   // Convert to rad
+        const float sigma_rad = sigma * 0.001f; // Convert to rad
 
         // Build an orthonormal basis
         float3 w = normalize(dir);
@@ -119,7 +124,7 @@ namespace OptixCSP {
     {
         curandState rng = params.rng_states[ray_number];
 
-        const float max_angle_mrad = params.sun_max_angle;      // [mrad]
+        const float max_angle_mrad = params.sun_max_angle; // [mrad]
 
         // Orthonormal basis about dir
         float3 w = normalize(dir);
@@ -141,7 +146,7 @@ namespace OptixCSP {
             theta2 = thetax * thetax + thetay * thetay;
             theta = sqrtf(theta2);
 
-            if (theta <= 4.65f)  // within solar disc (mrad, as in CPU code)
+            if (theta <= 4.65f) // within solar disc (mrad, as in CPU code)
             {
                 // stest = cos(0.326 * theta) / cos(0.308 * theta);
                 float t = theta;
@@ -168,8 +173,7 @@ namespace OptixCSP {
         float3 local_dir = make_float3(
             sin_t * cosf(phi),
             sin_t * sinf(phi),
-            cos_t
-        );
+            cos_t);
 
         params.rng_states[ray_number] = rng;
 
@@ -216,8 +220,7 @@ namespace OptixCSP {
         float3 local_dir = make_float3(
             sin_t * cosf(phi),
             sin_t * sinf(phi),
-            cos_t
-        );
+            cos_t);
 
         params.rng_states[ray_number] = rng;
 
@@ -226,8 +229,8 @@ namespace OptixCSP {
         return world_dir;
     }
 
-    __device__ float3 sampleRayDirectionInCone_UserDefined(float3 dir, int user_capacity, float* user_angle, 
-        float* user_intensity, unsigned int ray_number)
+    __device__ float3 sampleRayDirectionInCone_UserDefined(float3 dir, int user_capacity, float *user_angle,
+                                                           float *user_intensity, unsigned int ray_number)
     {
         curandState rng = params.rng_states[ray_number];
 
@@ -236,7 +239,7 @@ namespace OptixCSP {
             return normalize(dir);
         }
 
-        const float max_angle_mrad = params.sun_max_angle;  // [mrad]
+        const float max_angle_mrad = params.sun_max_angle; // [mrad]
         const float max_int = params.sun_max_intensity;
 
         // Orthonormal basis about dir
@@ -268,8 +271,7 @@ namespace OptixCSP {
                     stest = user_intensity[i];
                 else
                 {
-                    stest = user_intensity[i - 1] + (user_intensity[i] - user_intensity[i - 1]) * (theta - user_angle[i - 1])
-                        / denom;
+                    stest = user_intensity[i - 1] + (user_intensity[i] - user_intensity[i - 1]) * (theta - user_angle[i - 1]) / denom;
                 }
             }
 
@@ -287,8 +289,7 @@ namespace OptixCSP {
         float3 local_dir = make_float3(
             sin_t * cosf(phi),
             sin_t * sinf(phi),
-            cos_t
-        );
+            cos_t);
 
         params.rng_states[ray_number] = rng;
 
@@ -302,22 +303,22 @@ namespace OptixCSP {
 extern "C" __global__ void __raygen__sun_source()
 {
     // Lookup location in launch grid
-    const uint3 launch_idx = optixGetLaunchIndex();         // Index of the current launch thread
-    const uint3 launch_dims = optixGetLaunchDimensions();   // Dimensions of the launch grid
-    const unsigned int ray_number = launch_idx.y * launch_dims.x + launch_idx.x;  // Unique ray ID
-    const unsigned int ray_number_global = ray_number + params.ray_offset;  // Global unique ray ID
+    const uint3 launch_idx = optixGetLaunchIndex();                              // Index of the current launch thread
+    const uint3 launch_dims = optixGetLaunchDimensions();                        // Dimensions of the launch grid
+    const unsigned int ray_number = launch_idx.y * launch_dims.x + launch_idx.x; // Unique ray ID
+    const unsigned long long ray_number_global = ray_number + params.ray_offset; // Global unique ray ID
 
     float3 sun_sample_pos;
     switch (params.sun_gen_type)
     {
-        case(OptixCSP::GenType::RANDOM):
-            sun_sample_pos = OptixCSP::randomSampleInParallelogram(ray_number);
-            break;
-        case(OptixCSP::GenType::HALTON):
-            sun_sample_pos = OptixCSP::haltonSampleInParallelogram(ray_number_global);
-            break;
-        default:          
-            return;
+    case (OptixCSP::GenType::RANDOM):
+        sun_sample_pos = OptixCSP::randomSampleInParallelogram(ray_number);
+        break;
+    case (OptixCSP::GenType::HALTON):
+        sun_sample_pos = OptixCSP::haltonSampleInParallelogram(ray_number_global);
+        break;
+    default:
+        return;
     }
 
     // Sample emission angle here - capturing sun distribution
@@ -331,47 +332,44 @@ extern "C" __global__ void __raygen__sun_source()
     {
         switch (params.sun_shape)
         {
-            case(OptixCSP::SunShape::PILLBOX):
-                ray_dir = OptixCSP::sampleRayDirectionInCone_Pillbox(init_ray_dir, params.half_width, ray_number);
-                break;
-            case(OptixCSP::SunShape::GAUSSIAN):
-                ray_dir = OptixCSP::sampleRayDirectionInCone_Gaussian(init_ray_dir, params.sigma, ray_number);
-                break;
-            case(OptixCSP::SunShape::BUIE_CSR):
-                ray_dir = OptixCSP::sampleRayDirectionInCone_BuieCSR(init_ray_dir, params.buie_kappa, params.buie_gamma, ray_number);
-                break;
-            case(OptixCSP::SunShape::LIMBDARKENED):
-                ray_dir = OptixCSP::sampleRayDirectionInCone_LimbDarkened(init_ray_dir, ray_number);
-                break;
-            case(OptixCSP::SunShape::USER_DEFINED):
-                ray_dir = OptixCSP::sampleRayDirectionInCone_UserDefined(init_ray_dir, params.sun_user_capacity, 
-                    params.sun_user_angle, params.sun_user_intensity, ray_number);
-                break;
-            default:
-                assert(false);
-                // Just return since the sun shape is not supported
-                return;
+        case (OptixCSP::SunShape::PILLBOX):
+            ray_dir = OptixCSP::sampleRayDirectionInCone_Pillbox(init_ray_dir, params.half_width, ray_number);
+            break;
+        case (OptixCSP::SunShape::GAUSSIAN):
+            ray_dir = OptixCSP::sampleRayDirectionInCone_Gaussian(init_ray_dir, params.sigma, ray_number);
+            break;
+        case (OptixCSP::SunShape::BUIE_CSR):
+            ray_dir = OptixCSP::sampleRayDirectionInCone_BuieCSR(init_ray_dir, params.buie_kappa, params.buie_gamma, ray_number);
+            break;
+        case (OptixCSP::SunShape::LIMBDARKENED):
+            ray_dir = OptixCSP::sampleRayDirectionInCone_LimbDarkened(init_ray_dir, ray_number);
+            break;
+        case (OptixCSP::SunShape::USER_DEFINED):
+            ray_dir = OptixCSP::sampleRayDirectionInCone_UserDefined(init_ray_dir, params.sun_user_capacity,
+                                                                     params.sun_user_angle, params.sun_user_intensity, ray_number);
+            break;
+        default:
+            assert(false);
+            // Just return since the sun shape is not supported
+            return;
         }
     }
     else
     {
         ray_dir = init_ray_dir;
     }
-    
-    
-    //float3 ray_dir = OptixCSP::sampleRayDirectionInCone_Gaussian(init_ray_dir, params.max_sun_angle, ray_number);
+
+    // float3 ray_dir = OptixCSP::sampleRayDirectionInCone_Gaussian(init_ray_dir, params.max_sun_angle, ray_number);
 
     // Create the PerRayData structure to track ray state (e.g., path index and recursion depth)
     OptixCSP::PerRayData prd;
     prd.ray_path_index = ray_number;
     prd.depth = 0;
 
-    // TODO make this a launch parameter
-    params.hit_point_buffer[params.max_depth * prd.ray_path_index] = make_float4(0.0f, ray_gen_pos);
-    params.element_id_buffer[params.max_depth * prd.ray_path_index] = OptixCSP::kElementIdRayGen;
-    params.hit_type_buffer[params.max_depth * prd.ray_path_index] = OptixCSP::HitType::HIT_CREATE;
+    params.hit_buffer[params.max_depth * prd.ray_path_index].hit_point = make_float4(0.0f, ray_gen_pos);
+    params.hit_buffer[params.max_depth * prd.ray_path_index].element_id = OptixCSP::kElementIdRayGen;
+    params.hit_buffer[params.max_depth * prd.ray_path_index].hit_type = OptixCSP::HitType::HIT_CREATE;
     params.sun_dir_buffer[prd.ray_path_index] = ray_dir;
-    
 
     // Cast and trace the ray through the scene
     optixTrace(
@@ -386,7 +384,6 @@ extern "C" __global__ void __raygen__sun_source()
         OptixCSP::RAY_TYPE_RADIANCE, // Ray type (radiance for sunlight)
         OptixCSP::RAY_TYPE_COUNT,    // Number of ray types
         OptixCSP::RAY_TYPE_RADIANCE, // SBT offset (ray type to launch)
-        reinterpret_cast<unsigned int&>(prd.ray_path_index),
-        reinterpret_cast<unsigned int&>(prd.depth)  
-    );
-}
\ No newline at end of file
+        reinterpret_cast<unsigned int &>(prd.ray_path_index),
+        reinterpret_cast<unsigned int &>(prd.depth));
+}
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/utils/util_check.hpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/utils/util_check.hpp
index c9247570..e8927199 100644
--- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/utils/util_check.hpp
+++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/utils/util_check.hpp
@@ -83,7 +83,7 @@ namespace OptixCSP {
                 << std::string(log, log + log_size);
             throw std::runtime_error(oss.str());
         }
-        else if (log_size > 1)
+        else if (log_size > 1 && log != nullptr && log[0] != '\0')
         {
             std::cerr << "OptiX log for " << func << ":\n"
                 << std::string(log, log + log_size) << std::endl;
diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp
index be7f6979..532638a6 100644
--- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp
+++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp
@@ -1,8 +1,7 @@
 #include "simulation_runner/optix_runner/optix_runner.hpp"
-#include "simulation_data/simulation_parameters.hpp"
-#include "simulation_data/simulation_data.hpp"
 #include "simulation_data/simulation_data_export.hpp"
 
+#include <iostream>
 #include <sstream>
 #include <stdexcept>
 
@@ -20,6 +19,46 @@ OptixRunner::~OptixRunner()
     this->m_sys.clean_up();
 }
 
+void OptixRunner::set_verbose(bool verbose)
+{
+    m_sys.set_verbose(verbose);
+}
+
+void OptixRunner::print_timing() const
+{
+    m_sys.print_timing();
+}
+
+void OptixRunner::set_max_ray_depth(uint_fast64_t depth)
+{
+    m_sys.set_max_ray_depth(depth);
+}
+
+void OptixRunner::set_batch_size(uint_fast64_t batch_size)
+{
+    m_sys.set_batch_size(batch_size);
+}
+
+uint_fast64_t OptixRunner::get_batch_size() const
+{
+    return m_sys.get_batch_size();
+}
+
+void OptixRunner::set_trim_excess_rays(bool trim)
+{
+    m_sys.set_trim_excess_rays(trim);
+}
+
+bool OptixRunner::get_trim_excess_rays() const
+{
+    return m_sys.get_trim_excess_rays();
+}
+
+uint64_t OptixRunner::get_N_run_iterations() const
+{
+    return m_sys.get_N_run_iterations();
+}
+
 RunnerStatus OptixRunner::initialize()
 {
     // add elements to sys using data structure from SimulationData
@@ -51,8 +90,6 @@ RunnerStatus OptixRunner::setup_simulation(const SimulationData *data)
 
     m_sys.initialize();
 
-    
-
     // std::cout << "Number of stages: " << this->tsys.StageList.size()
     //           << std::endl;
 
@@ -63,6 +100,7 @@ RunnerStatus OptixRunner::setup_parameters(const SimulationData *data)
 {
     // Get Parameter data
     const SimulationParameters &sim_params = data->get_simulation_parameters();
+
     m_sys.set_number_of_rays(sim_params.number_of_rays, sim_params.max_number_of_rays);
     m_sys.set_seed(static_cast<uint64_t>(sim_params.seed));
 
@@ -106,6 +144,17 @@ RunnerStatus OptixRunner::setup_sun(const SimulationData *data)
         }
     }
 
+    // Warn if Halton sampling is used with more rays than uint32_t can index,
+    // since the Halton sequence index is truncated to 32 bits causing repeated positions.
+    if (sun->get_gen_type() == SolTrace::Data::GenType::HALTON &&
+        data->get_simulation_parameters().max_number_of_rays > static_cast<uint_fast64_t>(std::numeric_limits<uint32_t>::max()))
+    {
+        std::cerr << "Warning: max_number_of_rays exceeds 32-bit unsigned int maximum ("
+                  << std::numeric_limits<uint32_t>::max()
+                  << ") with Halton ray generation. Halton sequence positions will repeat after index "
+                  << std::numeric_limits<uint32_t>::max() << "." << std::endl;
+    }
+
     return RunnerStatus::SUCCESS;
 }
 
@@ -124,8 +173,10 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data)
 
             auto optix_el = std::make_shared<OptixCSP::CspElement>();
             auto origin = el->get_origin_global();
-            OptixCSP::Vec3d origin_vec(origin.x, origin.y, origin.z);
+            auto ap = el->get_aim_vector_global();
+            // OptixCSP::Vec3d origin_vec(origin.x, origin.y, origin.z);
             optix_el->set_origin(ToVec3d(origin));
+            optix_el->set_aim_point(ToVec3d(ap));
             optix_el->set_rotation_matrix(ToMatrix33d(el->get_local_to_global()));
 
             // Safely narrow element id to int32_t
@@ -136,8 +187,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data)
             }
             optix_el->set_id(static_cast<int32_t>(id));
 
-            // TODO: check zrot, radiance or degree here?
-
             // Add optical properties
             OpticalProperties *opt_front = el->get_front_optical_properties();
             OptixCSP::OpticalDistribution od = this->to_optical_distribution(opt_front->error_distribution_type);
@@ -214,7 +263,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data)
                 }
 
                 auto surface = std::make_shared<OptixCSP::SurfaceCylinder>();
-                // surface->set_half_height(2.); // TODO this needs to come from the aperture
                 surface->set_half_height(0.5 * el_aperture->y_length());
                 surface->set_radius(el_surface->radius);
                 optix_el->set_surface(surface);
@@ -228,22 +276,39 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data)
             }
 
             auto soltrace_aperture_type = el->get_aperture()->get_type();
-            
+
             switch (soltrace_aperture_type)
             {
-
             case ApertureType::RECTANGLE:
             {
-
                 auto el_aperture = std::dynamic_pointer_cast<Rectangle>(el->get_aperture());
                 assert(el_aperture != nullptr);
                 // TODO: account for x and y coord?
-                auto aperture = std::make_shared<OptixCSP::ApertureRectangle>(el_aperture->x_length(), el_aperture->y_length(),
-                    el_aperture->x_coord(), el_aperture->y_coord());
+                // auto aperture = std::make_shared<OptixCSP::ApertureRectangle>(el_aperture->x_length(),
+                // 							      el_aperture->y_length());
+                auto aperture = std::make_shared<OptixCSP::ApertureRectangle>(el_aperture->x_length(),
+                                                                              el_aperture->y_length(),
+                                                                              el_aperture->x_coord(),
+                                                                              el_aperture->y_coord());
+                optix_el->set_aperture(aperture);
+                break;
+            }
+            case ApertureType::ANNULUS:
+            {
+                auto el_aperture = std::dynamic_pointer_cast<Annulus>(el->get_aperture());
+                assert(el_aperture != nullptr);
+                auto aperture = std::make_shared<OptixCSP::ApertureAnnulus>(el_aperture->inner_radius, el_aperture->outer_radius, el_aperture->arc_angle * D2R);
+                optix_el->set_aperture(aperture);
+                break;
+            }
+            case ApertureType::CIRCLE:
+            {
+                auto el_aperture = std::dynamic_pointer_cast<Circle>(el->get_aperture());
+                assert(el_aperture != nullptr);
+                auto aperture = std::make_shared<OptixCSP::ApertureCircle>(0.5 * el_aperture->diameter);
                 optix_el->set_aperture(aperture);
                 break;
             }
-
             case ApertureType::EQUILATERAL_TRIANGLE:
             {
                 auto el_aperture = std::dynamic_pointer_cast<EquilateralTriangle>(el->get_aperture());
@@ -259,7 +324,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data)
 
                 break;
             }
-
             case ApertureType::IRREGULAR_TRIANGLE:
             {
                 auto el_aperture = std::dynamic_pointer_cast<IrregularTriangle>(el->get_aperture());
@@ -274,7 +338,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data)
 
                 break;
             }
-
             case ApertureType::IRREGULAR_QUADRILATERAL:
             {
                 auto el_aperture = std::dynamic_pointer_cast<IrregularQuadrilateral>(el->get_aperture());
@@ -290,11 +353,18 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data)
 
                 break;
             }
-
+            case ApertureType::HEXAGON:
+            {
+                auto el_aperture = std::dynamic_pointer_cast<Hexagon>(el->get_aperture());
+                assert(el_aperture != nullptr);
+                auto aperture = std::make_shared<OptixCSP::ApertureHexagon>(el_aperture->radius_circumscribed_circle());
+                optix_el->set_aperture(aperture);
+                break;
+            }
             default:
                 // std::cerr << "Unsupported aperture type in OptixCSP" << std::endl;
-	      throw std::runtime_error("Unsupported aperture type in OptixRunner");
-	      break;
+                throw std::runtime_error("Unsupported aperture type in OptixRunner");
+                break;
             }
 
             double xmin, xmax, ymin, ymax, zmin, zmax;
@@ -323,9 +393,8 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data)
 
 RunnerStatus OptixRunner::update_simulation(const SimulationData *data)
 {
-    this->setup_simulation(data);
-    // TODO: Implement this
-    return RunnerStatus::SUCCESS;
+    return this->setup_simulation(data);
+    // TODO: Implement this in a less lazy manner...
 }
 
 RunnerStatus OptixRunner::run_simulation()
@@ -374,6 +443,8 @@ RunnerStatus OptixRunner::report_simulation(SimulationResult *result,
     std::map<unsigned int, SolTrace::Result::ray_record_ptr> ray_records;
     std::map<unsigned int, SolTrace::Result::ray_record_ptr>::iterator iter;
 
+    // TODO: This should be redone without using these vectors and just using the
+    // internal hit record vector
     // Get results from optixcsp
     std::vector<float4> hp_vec;
     std::vector<uint_fast64_t> raynumber_vec;
@@ -390,7 +461,7 @@ RunnerStatus OptixRunner::report_simulation(SimulationResult *result,
     // Loop through data, populating ray records
     // Assumes ray data is grouped serially
     size_t ndata = hp_vec.size();
-    uint_fast64_t raynum_prev = -1;
+    // uint_fast64_t raynum_prev = -1;
     uint_fast64_t raynum = 0;
     SolTrace::Result::ray_record_ptr rec = nullptr;
     SolTrace::Result::interaction_ptr intr = nullptr;
@@ -399,7 +470,7 @@ RunnerStatus OptixRunner::report_simulation(SimulationResult *result,
         // Collect results for record
         raynum = raynumber_vec[ii];
         glm::dvec3 pos(hp_vec[ii].y, hp_vec[ii].z, hp_vec[ii].w); // x is depth
-        glm::dvec3 cos(0.0);                                  // TODO: calculate directions
+        glm::dvec3 cos(0.0);                                      // TODO: calculate directions
         int32_t element_id = element_id_vec[ii];
         uint8_t hit_type = hit_type_vec[ii];
         SolTrace::Result::RayEvent rev = hit_type_to_ray_event(static_cast<OptixCSP::HitType>(hit_type));
@@ -446,7 +517,7 @@ OptixCSP::Vec3d OptixRunner::ToVec3d(glm::dvec3 v)
     return vec;
 }
 
-OptixCSP::Matrix33d OptixRunner::ToMatrix33d(const glm::dmat3& mat)
+OptixCSP::Matrix33d OptixRunner::ToMatrix33d(const glm::dmat3 &mat)
 {
     return OptixCSP::Matrix33d(
         mat[0][0], mat[1][0], mat[2][0],
diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp
index 7f4054dc..98c87c60 100644
--- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp
+++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp
@@ -30,9 +30,38 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner
     SolTrace::Runner::RunnerStatus get_hp_output(std::vector<float4>& hp_vec,
         std::vector<uint_fast64_t>& raynumber_vec, std::vector<int32_t>& element_id_vec);
 
-    double get_sun_plane_area() { return m_sys.get_sun_plane_area(); }
+    double get_sun_plane_area() const { return m_sys.get_sun_plane_area(); }
 
-    uint_fast64_t get_N_sun_rays() { return m_sys.get_N_sun_rays(); }
+    uint_fast64_t get_N_sun_rays() const { return m_sys.get_N_sun_rays(); }
+    inline uint_fast64_t get_number_rays_launched() const override {return get_N_sun_rays(); }
+    inline uint_fast64_t get_number_rays_traced() const override {return m_sys.get_N_hit_rays(); }
+
+    uint64_t get_N_run_iterations() const;
+
+    void print_timing() const;
+
+    void set_verbose(bool verbose);
+
+    // Set the number of rays to launch for a trace in each optixLaunch call.
+    // WARNING: The runner is forced to use this batch size regardless of available GPU memory!!!!
+    // Setting a large batch size can cause device out of memory errors or degraded GPU performance.
+    // Setting a small batch size can cause long run times. Care is required when using this function.
+    void set_batch_size(uint_fast64_t batch_size);
+    uint_fast64_t get_batch_size() const;
+
+    /// Set the maximum ray interaction depth. Must be called before initialize().
+    /// Depth is clamped to [2, 255] with a warning if either bound is exceeded. Defaults to DEFAULT_MAX_TRACE_DEPTH.
+    void set_max_ray_depth(uint_fast64_t depth);
+    uint8_t get_max_ray_depth() const { return m_sys.get_max_ray_depth(); }
+
+    /// Returns the number of rays terminated by reaching max_depth without being absorbed.
+    /// Valid after run_simulation() completes; resets to 0 at the start of each run.
+    uint_fast64_t get_N_depth_exceeded_rays() const { return m_sys.get_N_depth_exceeded_rays(); }
+
+    /// Enable or disable trimming of excess rays at the end of run() so that
+    /// exactly the requested number of hit rays is returned.  Enabled by default.
+    void set_trim_excess_rays(bool trim);
+    bool get_trim_excess_rays() const;
 
     // Runner options
     // void disable_sun_shape_errors() { this->include_sun_shape_errors = false; }
diff --git a/coretrace/simulation_runner/simulation_runner.hpp b/coretrace/simulation_runner/simulation_runner.hpp
index 91c2c7e2..e8e21ba7 100644
--- a/coretrace/simulation_runner/simulation_runner.hpp
+++ b/coretrace/simulation_runner/simulation_runner.hpp
@@ -64,6 +64,9 @@ namespace SolTrace::Runner
         virtual RunnerStatus report_simulation(SolTrace::Result::SimulationResult *result,
                                                int level_spec) = 0;
 
+        virtual uint_fast64_t get_number_rays_launched() const = 0;
+        virtual uint_fast64_t get_number_rays_traced() const = 0;
+
     private:
     };
 
diff --git a/google-tests/unit-tests/simulation_data/aperture_test.cpp b/google-tests/unit-tests/simulation_data/aperture_test.cpp
index 3ee7544e..ddf0f642 100644
--- a/google-tests/unit-tests/simulation_data/aperture_test.cpp
+++ b/google-tests/unit-tests/simulation_data/aperture_test.cpp
@@ -334,7 +334,7 @@ TEST(Aperture, Hexagon)
     const double TOL = 1e-12;
     const double D = 2.0;
     const double R = 0.5 * D;
-    const double S = sqrt(3.0) * R; // Side length of hexagon
+    // const double S = sqrt(3.0) * R; // Side length of hexagon
     const double AREA = 0.5 * sqrt(27.0) * R * R;
 
     const double X1 = 1.0;
diff --git a/google-tests/unit-tests/simulation_runner/embree_runner/embree_runner_multithreading_test.cpp b/google-tests/unit-tests/simulation_runner/embree_runner/embree_runner_multithreading_test.cpp
index b735fae7..c70054cf 100644
--- a/google-tests/unit-tests/simulation_runner/embree_runner/embree_runner_multithreading_test.cpp
+++ b/google-tests/unit-tests/simulation_runner/embree_runner/embree_runner_multithreading_test.cpp
@@ -73,7 +73,7 @@ TEST(EmbreeRunner, CancelMultithread)
     auto fsts = std::async(&EmbreeRunner::run_simulation, &runner);
 
     // Give time to start processing
-    std::this_thread::sleep_for(std::chrono::milliseconds(250));
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
     sts = runner.status_simulation();
     EXPECT_EQ(sts, RunnerStatus::RUNNING);
 
diff --git a/google-tests/unit-tests/simulation_runner/native_runner/native_runner_test.cpp b/google-tests/unit-tests/simulation_runner/native_runner/native_runner_test.cpp
index 9dda077d..c23f0e9b 100644
--- a/google-tests/unit-tests/simulation_runner/native_runner/native_runner_test.cpp
+++ b/google-tests/unit-tests/simulation_runner/native_runner/native_runner_test.cpp
@@ -236,6 +236,59 @@ TEST(NativeRunner, SmokeTest)
               << std::endl;
 }
 
+TEST(NativeRunner, RaysLaunchedEqualsRequestedAfterRun)
+{
+    const uint_fast64_t NRAYS = 10;
+    NativeRunner runner;
+    SimulationData my_sim;
+
+    SimulationParameters &params = my_sim.get_simulation_parameters();
+    params.include_optical_errors = false;
+    params.include_sun_shape_errors = false;
+    params.number_of_rays = NRAYS;
+    params.max_number_of_rays = 10 * NRAYS;
+
+    auto sun = SolTrace::Data::make_ray_source<Sun>();
+    sun->set_position(0.0, 0.0, 100.0);
+    sun->set_shape(SolTrace::Data::SunShape::GAUSSIAN, 1.0, -5.0, 0.0);
+    my_sim.add_ray_source(sun);
+
+    auto my_st = SolTrace::Data::make_stage(0);
+    const int NUM_ELEMENTS = 4;
+    double x[NUM_ELEMENTS] = {1.0, 0.0, -1.0, 0.0};
+    double y[NUM_ELEMENTS] = {0.0, 1.0, 0.0, -1.0};
+    OpticalProperties optics(SolTrace::Data::InteractionType::REFLECTION,
+                             SolTrace::Data::DistributionType::GAUSSIAN,
+                             0.0, 1.0, 0.0, 0.0, 1.0, 1.0);
+    for (int k = 0; k < NUM_ELEMENTS; ++k)
+    {
+        element_ptr el = SolTrace::Data::make_element<SingleElement>();
+        el->set_aperture(SolTrace::Data::make_aperture<Circle>(2.0));
+        el->set_surface(SolTrace::Data::make_surface<Flat>());
+        el->set_reference_frame_geometry(glm::dvec3(x[k], y[k], 0.0),
+                                         glm::dvec3(-x[k], -y[k], 1.0),
+                                         0.0);
+        el->set_front_optical_properties(optics);
+        el->set_back_optical_properties(optics);
+        my_st->add_element(el);
+    }
+    my_sim.add_stage(my_st);
+
+    RunnerStatus sts = runner.initialize();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.setup_simulation(&my_sim);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    // Before running, SunRayCount has not been accumulated yet
+    EXPECT_EQ(runner.get_number_rays_launched(), static_cast<uint_fast64_t>(0));
+
+    sts = runner.run_simulation();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    EXPECT_GE(runner.get_number_rays_launched(), NRAYS);
+    EXPECT_EQ(runner.get_number_rays_traced(), NRAYS);
+}
+
 TEST(NativeRunner, PowerTowerSmokeTest)
 {
     SimulationData sd;
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt b/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt
index 3103ba0e..9df394d1 100644
--- a/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt
+++ b/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt
@@ -20,6 +20,9 @@ set(OPTIX_RUNNER_TEST_SRC
     flat_optical_test.cpp
     two_plate_test.cpp
     sun_test.cpp
+    ray_position_sampling_test.cpp
+    batch_size_test.cpp
+    max_ray_depth_test.cpp
 )
 
 add_executable(OptixRunnerUnitTests
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp
new file mode 100644
index 00000000..7bd5d058
--- /dev/null
+++ b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp
@@ -0,0 +1,185 @@
+#include <gtest/gtest.h>
+
+#include <limits>
+#include <stdexcept>
+
+#include <optix_runner.hpp>
+#include <simulation_data.hpp>
+#include <simulation_data_export.hpp>
+#include <simulation_result_export.hpp>
+
+using SolTrace::Runner::RunnerStatus;
+
+// Reuse the two-plate scene defined in two_plate_test.cpp
+void make_two_plate_sd(SimulationData &sd, element_ptr &plate1, element_ptr &plate2);
+
+// ---------------------------------------------------------------------------
+// set_batch_size / get_batch_size accessor tests (no GPU required)
+// ---------------------------------------------------------------------------
+
+// Default value of 0 means automatic batch sizing: determine_batch_size() will
+// call automatic_batch_size() to compute a GPU-memory-safe batch size at run
+// time. It does NOT mean "launch all rays in a single batch".
+TEST(OptixRunnerBatchSize, DefaultIsZeroMeansAutoSize)
+{
+    OptixRunner runner;
+    EXPECT_EQ(runner.get_batch_size(), 0u);
+}
+
+TEST(OptixRunnerBatchSize, SetAndGet)
+{
+    OptixRunner runner;
+    runner.set_batch_size(500);
+    EXPECT_EQ(runner.get_batch_size(), 500u);
+}
+
+// Setting back to 0 restores automatic GPU-memory-based sizing.
+TEST(OptixRunnerBatchSize, SetZeroRestoresAutoSize)
+{
+    OptixRunner runner;
+    runner.set_batch_size(1000);
+    runner.set_batch_size(0);
+    EXPECT_EQ(runner.get_batch_size(), 0u);
+}
+
+TEST(OptixRunnerBatchSize, ThrowsOnOverflow)
+{
+    OptixRunner runner;
+    const uint_fast64_t too_large =
+        static_cast<uint_fast64_t>(std::numeric_limits<int>::max()) + 1ULL;
+    EXPECT_THROW(runner.set_batch_size(too_large), std::out_of_range);
+    // Value should be unchanged after the throw
+    EXPECT_EQ(runner.get_batch_size(), 0u);
+}
+
+// INT_MAX itself is the largest valid batch size; it must not throw.
+TEST(OptixRunnerBatchSize, MaxIntBoundaryDoesNotThrow)
+{
+    OptixRunner runner;
+    const uint_fast64_t max_valid =
+        static_cast<uint_fast64_t>(std::numeric_limits<int>::max());
+    EXPECT_NO_THROW(runner.set_batch_size(max_valid));
+    EXPECT_EQ(runner.get_batch_size(), max_valid);
+}
+
+// A failed set_batch_size must not corrupt a previously stored non-zero value.
+TEST(OptixRunnerBatchSize, ThrowPreservesExistingValue)
+{
+    OptixRunner runner;
+    runner.set_batch_size(999);
+    const uint_fast64_t too_large =
+        static_cast<uint_fast64_t>(std::numeric_limits<int>::max()) + 1ULL;
+    EXPECT_THROW(runner.set_batch_size(too_large), std::out_of_range);
+    EXPECT_EQ(runner.get_batch_size(), 999u);
+}
+
+// ---------------------------------------------------------------------------
+// Simulation correctness: batched run should yield the same hit count as the
+// default single-batch run.
+// ---------------------------------------------------------------------------
+
+TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount)
+{
+    const int N_rays = 10000;
+
+    // --- reference run (default auto-sized batch: m_batch_size == 0 defers to
+    //     determine_batch_size() / automatic_batch_size()) ---
+    SimulationData sd_ref;
+    element_ptr p1_ref, p2_ref;
+    make_two_plate_sd(sd_ref, p1_ref, p2_ref);
+    sd_ref.get_simulation_parameters().number_of_rays = N_rays;
+    sd_ref.get_simulation_parameters().max_number_of_rays = N_rays * 100;
+
+    OptixRunner ref_runner;
+    ASSERT_EQ(ref_runner.initialize(), RunnerStatus::SUCCESS);
+    ASSERT_EQ(ref_runner.setup_simulation(&sd_ref), RunnerStatus::SUCCESS);
+    ASSERT_EQ(ref_runner.run_simulation(), RunnerStatus::SUCCESS);
+
+    SimulationResult ref_result;
+    ASSERT_EQ(ref_runner.report_simulation(&ref_result, 0), RunnerStatus::SUCCESS);
+    const int ref_hits = ref_result.get_number_of_records();
+
+    // --- explicit batched run (user-supplied batch_size = 1000, ~10 iterations) ---
+    SimulationData sd_batch;
+    element_ptr p1_batch, p2_batch;
+    make_two_plate_sd(sd_batch, p1_batch, p2_batch);
+    sd_batch.get_simulation_parameters().number_of_rays = N_rays;
+    sd_batch.get_simulation_parameters().max_number_of_rays = N_rays * 100;
+
+    OptixRunner batch_runner;
+    batch_runner.set_batch_size(1000);
+    ASSERT_EQ(batch_runner.initialize(), RunnerStatus::SUCCESS);
+    ASSERT_EQ(batch_runner.setup_simulation(&sd_batch), RunnerStatus::SUCCESS);
+    ASSERT_EQ(batch_runner.run_simulation(), RunnerStatus::SUCCESS);
+
+    SimulationResult batch_result;
+    ASSERT_EQ(batch_runner.report_simulation(&batch_result, 0), RunnerStatus::SUCCESS);
+    const int batch_hits = batch_result.get_number_of_records();
+
+    EXPECT_EQ(ref_hits, N_rays);
+    EXPECT_EQ(batch_hits, N_rays);
+
+    // With a user-supplied batch of 1000 and 10000 rays, at least 10 iterations
+    // are required regardless of available GPU memory.
+    EXPECT_GE(batch_runner.get_N_run_iterations(), 10u);
+}
+
+// ---------------------------------------------------------------------------
+// Batch size smaller than total rays forces multiple iterations.
+// ---------------------------------------------------------------------------
+
+TEST(OptixRunnerBatchSize, SmallBatchMultipleIterations)
+{
+    const int N_rays = 5000;
+    const int batch = 500; // 10+ iterations needed
+
+    SimulationData sd;
+    element_ptr plate1, plate2;
+    make_two_plate_sd(sd, plate1, plate2);
+    sd.get_simulation_parameters().number_of_rays = N_rays;
+    sd.get_simulation_parameters().max_number_of_rays = N_rays * 100;
+
+    OptixRunner runner;
+    runner.set_batch_size(batch);
+    ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS);
+    ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS);
+    ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS);
+
+    SimulationResult result;
+    ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS);
+
+    EXPECT_EQ(result.get_number_of_records(), N_rays);
+
+    // With a small batch the runner must have generated at least N_rays sun rays
+    EXPECT_GE(runner.get_N_sun_rays(), static_cast<uint_fast64_t>(N_rays));
+
+    // Multiple iterations must have been needed
+    EXPECT_GE(runner.get_N_run_iterations(), 10u);
+}
+
+// ---------------------------------------------------------------------------
+// Batch size >= N_rays: should complete in exactly one iteration.
+// ---------------------------------------------------------------------------
+
+TEST(OptixRunnerBatchSize, BatchSizeExceedingRaysCompletesInOneIteration)
+{
+    const int N_rays = 1000;
+
+    SimulationData sd;
+    element_ptr plate1, plate2;
+    make_two_plate_sd(sd, plate1, plate2);
+    sd.get_simulation_parameters().number_of_rays = N_rays;
+    sd.get_simulation_parameters().max_number_of_rays = N_rays * 100;
+
+    OptixRunner runner;
+    runner.set_batch_size(N_rays * 8); // larger than N_rays
+    ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS);
+    ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS);
+    ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS);
+
+    SimulationResult result;
+    ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS);
+
+    EXPECT_EQ(result.get_number_of_records(), N_rays);
+    EXPECT_EQ(runner.get_N_run_iterations(), 1u);
+}
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp
index 1f34016c..b1dd3f36 100644
--- a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp
+++ b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp
@@ -8,12 +8,13 @@
 using SolTrace::Runner::RunnerStatus;
 
 const double Z_ELEM = 50.0;
+const double Z_BACKSTOP = Z_ELEM - 0.5 * Z_ELEM;
 const double TOL = 1e-6;
 const uint_fast64_t NRAYS = 10000;
 
-void set_default_sd(SimulationData &sd,
-                    surface_ptr surf,
-                    aperture_ptr ap)
+element_id set_default_sd(SimulationData &sd,
+                          surface_ptr surf,
+                          aperture_ptr ap)
 {
     sd.clear();
 
@@ -22,7 +23,7 @@ void set_default_sd(SimulationData &sd,
     sun->set_position(0, 0, 100);
     sd.add_ray_source(sun);
 
-    // Make reflective flat el
+    // Make target element
     element_ptr el = make_element<SingleElement>();
     el->set_origin(0, 0, Z_ELEM);
     el->set_aim_vector(0, 0, 100); // Face up towards sun
@@ -36,7 +37,20 @@ void set_default_sd(SimulationData &sd,
     el->set_name("el");
 
     // Add element to stage
-    sd.add_element(el);
+    element_id id = sd.add_element(el);
+
+    // Back stop element that is bigger than the created element so that the
+    // testing element casts a shadow on this big thing.
+    element_ptr stop = make_element<SingleElement>();
+    double xlb, xub, ylb, yub;
+    ap->bounding_box(xlb, xub, ylb, yub);
+    const double sx = std::max(fabs(xlb), fabs(xub)) + 1.0;
+    const double sy = std::max(fabs(ylb), fabs(yub)) + 1.0;
+    stop->set_origin(0, 0, Z_BACKSTOP);
+    stop->set_aim_vector(0, 0, 100);
+    stop->set_surface(make_surface<Flat>());
+    stop->set_aperture(make_aperture<Rectangle>(sx, sy));
+    sd.add_element(stop);
 
     // Set parameters
     SimulationParameters &params = sd.get_simulation_parameters();
@@ -45,6 +59,8 @@ void set_default_sd(SimulationData &sd,
     params.include_optical_errors = false;
     params.include_sun_shape_errors = false;
     params.seed = 123;
+
+    return id;
 }
 
 TEST(OptixRunner, FlatRectangle)
@@ -55,7 +71,7 @@ TEST(OptixRunner, FlatRectangle)
     auto aper = make_aperture<Rectangle>(XL, YL);
 
     SimulationData sd;
-    set_default_sd(sd, surf, aper);
+    element_id test_elid = set_default_sd(sd, surf, aper);
     SimulationResult result;
 
     OptixRunner runner;
@@ -77,9 +93,24 @@ TEST(OptixRunner, FlatRectangle)
         glm::dvec3 p0, p1;
         rr->get_position(0, p0);
         rr->get_position(1, p1);
+        auto id = rr->get_element(1);
         EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
         EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
-        EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            // We hit the test element. Check that the height is as expected
+            EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+            // And that we are in the aperture
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            // We hit the back stop element. Check that the height is as expected
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            // And that we are not in the aperture.
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
     }
 }
 
@@ -90,7 +121,7 @@ TEST(OptixRunner, FlatEquilateralTriangle)
     auto aper = make_aperture<EquilateralTriangle>(d);
 
     SimulationData sd;
-    set_default_sd(sd, surf, aper);
+    element_id test_elid = set_default_sd(sd, surf, aper);
     SimulationResult result;
 
     OptixRunner runner;
@@ -112,9 +143,20 @@ TEST(OptixRunner, FlatEquilateralTriangle)
         glm::dvec3 p0, p1;
         rr->get_position(0, p0);
         rr->get_position(1, p1);
+        auto id = rr->get_element(1);
         EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
         EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
-        EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
     }
 }
 
@@ -126,7 +168,7 @@ TEST(OptixRunner, FlatTriangle)
     auto aper = make_aperture<IrregularTriangle>(x1, y1, x2, y2, x3, y3);
 
     SimulationData sd;
-    set_default_sd(sd, surf, aper);
+    element_id test_elid = set_default_sd(sd, surf, aper);
     SimulationResult result;
 
     OptixRunner runner;
@@ -148,9 +190,20 @@ TEST(OptixRunner, FlatTriangle)
         glm::dvec3 p0, p1;
         rr->get_position(0, p0);
         rr->get_position(1, p1);
+        auto id = rr->get_element(1);
         EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
         EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
-        EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
     }
 }
 
@@ -164,7 +217,7 @@ TEST(OptixRunner, FlatQuadrilateral)
         x1, y1, x2, y2, x3, y3, x4, y4);
 
     SimulationData sd;
-    set_default_sd(sd, surf, aper);
+    element_id test_elid = set_default_sd(sd, surf, aper);
     SimulationResult result;
 
     OptixRunner runner;
@@ -186,9 +239,20 @@ TEST(OptixRunner, FlatQuadrilateral)
         glm::dvec3 p0, p1;
         rr->get_position(0, p0);
         rr->get_position(1, p1);
+        auto id = rr->get_element(1);
         EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
         EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
-        EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
     }
 }
 
@@ -203,7 +267,7 @@ TEST(OptixRunner, ParabolaRectangle)
     auto aper = make_aperture<Rectangle>(XL, YL);
 
     SimulationData sd;
-    set_default_sd(sd, surf, aper);
+    element_id test_elid = set_default_sd(sd, surf, aper);
     SimulationResult result;
 
     OptixRunner runner;
@@ -225,10 +289,21 @@ TEST(OptixRunner, ParabolaRectangle)
         glm::dvec3 p0, p1;
         rr->get_position(0, p0);
         rr->get_position(1, p1);
+        auto id = rr->get_element(1);
         EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
         EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
-        const double z1 = Z_ELEM + 0.5 * CX * p1[0] * p1[0] + 0.5 * CY * p1[1] * p1[1];
-        EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            const double z1 = Z_ELEM + 0.5 * CX * p1[0] * p1[0] + 0.5 * CY * p1[1] * p1[1];
+            EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i;
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
     }
 }
 
@@ -240,7 +315,7 @@ TEST(OptixRunner, Cylinder)
     auto aper = make_aperture<Rectangle>(2 * R, YL);
 
     SimulationData sd;
-    set_default_sd(sd, surf, aper);
+    element_id test_elid = set_default_sd(sd, surf, aper);
     SimulationResult result;
 
     OptixRunner runner;
@@ -262,9 +337,160 @@ TEST(OptixRunner, Cylinder)
         glm::dvec3 p0, p1;
         rr->get_position(0, p0);
         rr->get_position(1, p1);
+        auto id = rr->get_element(1);
         EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
         EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
-        const double z1 = Z_ELEM + sqrt(R * R - p1[0] * p1[0]);
-        EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            const double z1 = Z_ELEM + sqrt(R * R - p1[0] * p1[0]);
+            EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i;
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
+    }
+}
+
+TEST(OptixRunner, FlatCircle)
+{
+    const double R = 5.0;
+    auto surf = make_surface<Flat>();
+    auto aper = make_aperture<Circle>(2 * R);
+
+    SimulationData sd;
+    element_id test_elid = set_default_sd(sd, surf, aper);
+    SimulationResult result;
+
+    OptixRunner runner;
+    RunnerStatus sts = runner.initialize();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.setup_simulation(&sd);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.run_simulation();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.report_simulation(&result, 0);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    ASSERT_EQ(result.get_number_of_records(),
+              sd.get_simulation_parameters().number_of_rays);
+    for (int i = 0; i < (int)result.get_number_of_records(); ++i)
+    {
+        auto rr = result[i];
+        ASSERT_GE(rr->get_number_of_interactions(), 2);
+        glm::dvec3 p0, p1;
+        rr->get_position(0, p0);
+        rr->get_position(1, p1);
+        auto id = rr->get_element(1);
+        EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
+        EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
+    }
+}
+
+TEST(OptixRunner, FlatHexagon)
+{
+    const double S = 5.0;
+    auto surf = make_surface<Flat>();
+    auto aper = make_aperture<Hexagon>(2 * S);
+
+    SimulationData sd;
+    element_id test_elid = set_default_sd(sd, surf, aper);
+    SimulationResult result;
+
+    OptixRunner runner;
+    RunnerStatus sts = runner.initialize();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.setup_simulation(&sd);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.run_simulation();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.report_simulation(&result, 0);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    ASSERT_EQ(result.get_number_of_records(),
+              sd.get_simulation_parameters().number_of_rays);
+    for (int i = 0; i < (int)result.get_number_of_records(); ++i)
+    {
+        auto rr = result[i];
+        ASSERT_GE(rr->get_number_of_interactions(), 2);
+        glm::dvec3 p0, p1;
+        rr->get_position(0, p0);
+        rr->get_position(1, p1);
+        auto id = rr->get_element(1);
+        EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
+        EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
+    }
+}
+
+TEST(OptixRunner, FlatAnnulus)
+{
+    const double R0 = 5.0;
+    const double R1 = 10.0;
+    const double ARC = 2 * PI;
+    auto surf = make_surface<Flat>();
+    auto aper = make_aperture<Annulus>(R0, R1, ARC);
+
+    SimulationData sd;
+    element_id test_elid = set_default_sd(sd, surf, aper);
+    SimulationResult result;
+
+    OptixRunner runner;
+    RunnerStatus sts = runner.initialize();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.setup_simulation(&sd);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.run_simulation();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.report_simulation(&result, 0);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    ASSERT_EQ(result.get_number_of_records(),
+              sd.get_simulation_parameters().number_of_rays);
+    for (int i = 0; i < (int)result.get_number_of_records(); ++i)
+    {
+        auto rr = result[i];
+        ASSERT_GE(rr->get_number_of_interactions(), 2);
+        glm::dvec3 p0, p1;
+        rr->get_position(0, p0);
+        rr->get_position(1, p1);
+        auto id = rr->get_element(1);
+        EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i;
+        EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i;
+
+        if (id == test_elid)
+        {
+            EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i;
+            EXPECT_TRUE(aper->is_in(p1[0], p1[1]));
+        }
+        else
+        {
+            EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM);
+            EXPECT_FALSE(aper->is_in(p1[0], p1[1]));
+        }
     }
 }
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/gpu_tower_demo.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/gpu_tower_demo.cpp
index ea771d5a..f99fb8e8 100644
--- a/google-tests/unit-tests/simulation_runner/optix_runner/gpu_tower_demo.cpp
+++ b/google-tests/unit-tests/simulation_runner/optix_runner/gpu_tower_demo.cpp
@@ -159,3 +159,80 @@ TEST(GpuTowerDemo, OptixRunnerWithStages)
     // sts = runner.report_simulation();
     ASSERT_EQ(sts, RunnerStatus::SUCCESS);
 }
+
+static void setup_tower_sd(SimulationData &sd, uint_fast64_t nrays)
+{
+    auto sun = make_ray_source<Sun>();
+    sun->set_position(0.0, 0.0, 100.0);
+    sd.add_ray_source(sun);
+
+    auto absorber = make_element<SingleElement>();
+    absorber->set_origin(0.0, 0.0, 10.0);
+    absorber->set_aim_vector(0.0, 5.0, 0.0);
+    absorber->set_surface(make_surface<Flat>());
+    absorber->set_aperture(make_aperture<Rectangle>(2.0, 2.0));
+    absorber->get_front_optical_properties()->set_ideal_absorption();
+
+    auto st1 = make_stage(1);
+    st1->set_origin(0.0, 0.0, 0.0);
+    st1->set_aim_vector(0.0, 0.0, 1.0);
+    st1->add_element(absorber);
+
+    auto st0 = make_stage(0);
+    st0->set_origin(0.0, 0.0, 0.0);
+    st0->set_aim_vector(0.0, 0.0, 1.0);
+
+    const double spacing = PI / 4.0;
+    for (int i = -1; i < 4; ++i)
+    {
+        auto el = make_element<SingleElement>();
+        el->get_front_optical_properties()->reflectivity = 1.0;
+
+        glm::dvec3 pos = {5 * sin(i * spacing), 5 * cos(i * spacing), 0.0};
+        el->set_origin(pos);
+        glm::dvec3 rvec = glm::normalize(absorber->get_origin_global() - pos);
+        glm::dvec3 svec = glm::normalize(sun->get_position());
+        glm::dvec3 avec = 0.5 * rvec + 0.5 * svec;
+        el->set_aim_vector(pos + 100.0 * avec);
+        el->set_zrot(30.0 * i);
+        el->set_surface(make_surface<Flat>());
+        el->set_aperture(make_aperture<Rectangle>(1.0, 1.95));
+        st0->add_element(el);
+    }
+
+    sd.add_stage(st0);
+    sd.add_stage(st1);
+
+    SimulationParameters &params = sd.get_simulation_parameters();
+    params.number_of_rays = nrays;
+    params.max_number_of_rays = nrays * 100;
+    params.include_optical_errors = false;
+    params.include_sun_shape_errors = false;
+    params.seed = 12345;
+}
+
+TEST(OptixRunner, RaysLaunchedEqualsRequestedAfterRun)
+{
+    const uint_fast64_t NRAYS = 100;
+    SimulationData sd;
+    setup_tower_sd(sd, NRAYS);
+
+    OptixRunner runner;
+    RunnerStatus sts = runner.initialize();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.setup_simulation(&sd);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    // Before running, no rays have been launched yet
+    EXPECT_EQ(runner.get_number_rays_launched(), static_cast<uint_fast64_t>(0));
+
+    sts = runner.run_simulation();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    EXPECT_GE(runner.get_number_rays_launched(), NRAYS);
+
+    const uint_fast64_t rays_traced = runner.get_number_rays_traced();
+    EXPECT_GT(rays_traced, static_cast<uint_fast64_t>(0));
+    EXPECT_LE(rays_traced, runner.get_number_rays_launched());
+    EXPECT_EQ(rays_traced, NRAYS);
+}
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp
new file mode 100644
index 00000000..59702454
--- /dev/null
+++ b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp
@@ -0,0 +1,234 @@
+#include <gtest/gtest.h>
+
+#include <optix_runner.hpp>
+#include <simulation_data.hpp>
+#include <simulation_data_export.hpp>
+#include <simulation_result_export.hpp>
+#include <shaders/Soltrace.h>        // OptixCSP::DEFAULT_MAX_TRACE_DEPTH
+#include <core/soltrace_constants.h> // OptixCSP::HitType
+
+using SolTrace::Runner::RunnerStatus;
+
+// Reuse the two-plate scene defined in two_plate_test.cpp.
+// Plate 1 is ideal-reflective; plate 2 is ideal-absorptive.
+// Each ray that hits the scene makes exactly 2 element interactions:
+//   1. reflect off plate 1
+//   2. absorb on plate 2
+void make_two_plate_sd(SimulationData &sd, element_ptr &plate1, element_ptr &plate2);
+
+// ---------------------------------------------------------------------------
+// set_max_ray_depth / get_max_ray_depth accessor tests (no GPU required)
+// ---------------------------------------------------------------------------
+
+TEST(OptixRunnerMaxRayDepth, DefaultIsDefaultMaxTraceDepth)
+{
+    OptixRunner runner;
+    EXPECT_EQ(runner.get_max_ray_depth(), OptixCSP::DEFAULT_MAX_TRACE_DEPTH);
+}
+
+TEST(OptixRunnerMaxRayDepth, SetAndGet)
+{
+    OptixRunner runner;
+    runner.set_max_ray_depth(10);
+    EXPECT_EQ(runner.get_max_ray_depth(), 10u);
+}
+
+// Minimum valid depth is 2; setting exactly 2 must be accepted without clamping.
+TEST(OptixRunnerMaxRayDepth, MinimumBoundaryAccepted)
+{
+    OptixRunner runner;
+    runner.set_max_ray_depth(2);
+    EXPECT_EQ(runner.get_max_ray_depth(), 2u);
+}
+
+// Maximum valid depth is 255; setting exactly 255 must be accepted without clamping.
+TEST(OptixRunnerMaxRayDepth, MaximumBoundaryAccepted)
+{
+    OptixRunner runner;
+    runner.set_max_ray_depth(255);
+    EXPECT_EQ(runner.get_max_ray_depth(), 255u);
+}
+
+// Depth of 0 is below the minimum; must be clamped to 2 with a warning.
+TEST(OptixRunnerMaxRayDepth, ZeroClampsToMinimum)
+{
+    OptixRunner runner;
+    testing::internal::CaptureStderr();
+    runner.set_max_ray_depth(0);
+    const std::string output = testing::internal::GetCapturedStderr();
+    EXPECT_EQ(runner.get_max_ray_depth(), 2u);
+    EXPECT_NE(output.find("WARNING"), std::string::npos);
+}
+
+// Depth of 1 is below the minimum; must be clamped to 2 with a warning.
+TEST(OptixRunnerMaxRayDepth, OneClampsToMinimum)
+{
+    OptixRunner runner;
+    testing::internal::CaptureStderr();
+    runner.set_max_ray_depth(1);
+    const std::string output = testing::internal::GetCapturedStderr();
+    EXPECT_EQ(runner.get_max_ray_depth(), 2u);
+    EXPECT_NE(output.find("WARNING"), std::string::npos);
+}
+
+// Depth of 256 exceeds the maximum; must be clamped to 255 with a warning.
+TEST(OptixRunnerMaxRayDepth, ExceedingMaxClampsTo255)
+{
+    OptixRunner runner;
+    testing::internal::CaptureStderr();
+    runner.set_max_ray_depth(256);
+    const std::string output = testing::internal::GetCapturedStderr();
+    EXPECT_EQ(runner.get_max_ray_depth(), 255u);
+    EXPECT_NE(output.find("WARNING"), std::string::npos);
+}
+
+// Large values (e.g. passing a big int) must also clamp to 255 with a warning.
+TEST(OptixRunnerMaxRayDepth, LargeValueClampsTo255)
+{
+    OptixRunner runner;
+    testing::internal::CaptureStderr();
+    runner.set_max_ray_depth(10000);
+    const std::string output = testing::internal::GetCapturedStderr();
+    EXPECT_EQ(runner.get_max_ray_depth(), 255u);
+    EXPECT_NE(output.find("WARNING"), std::string::npos);
+}
+
+// An out-of-range value is always clamped and stored, replacing any previous value.
+TEST(OptixRunnerMaxRayDepth, ClampedValueOverwritesPreviousValue)
+{
+    OptixRunner runner;
+    runner.set_max_ray_depth(20);
+    ASSERT_EQ(runner.get_max_ray_depth(), 20u);
+
+    testing::internal::CaptureStderr();
+    runner.set_max_ray_depth(0);
+    const std::string output = testing::internal::GetCapturedStderr();
+    EXPECT_EQ(runner.get_max_ray_depth(), 2u);
+    EXPECT_NE(output.find("WARNING"), std::string::npos);
+}
+
+// Setting a valid depth after an out-of-range call must work normally.
+TEST(OptixRunnerMaxRayDepth, ValidSetAfterClamp)
+{
+    OptixRunner runner;
+    testing::internal::CaptureStderr();
+    runner.set_max_ray_depth(0);
+    testing::internal::GetCapturedStderr();
+
+    runner.set_max_ray_depth(8);
+    EXPECT_EQ(runner.get_max_ray_depth(), 8u);
+}
+
+// ---------------------------------------------------------------------------
+// GPU trace test: verify the depth limit is actually enforced during tracing.
+//
+// The two-plate scene produces exactly 2 element interactions per ray under
+// normal conditions (reflect off plate 1, absorb on plate 2).  Setting
+// max_ray_depth = 2 reduces the maximum interactions per ray to 1, so the
+// second interaction is cut off.  We verify that no ray in the hit buffer
+// carries more than (max_ray_depth - 1) element interactions.
+// ---------------------------------------------------------------------------
+TEST(OptixRunnerMaxRayDepth, TraceDepthNotExceeded)
+{
+    SimulationData sd;
+    element_ptr plate1, plate2;
+    make_two_plate_sd(sd, plate1, plate2);
+
+    const uint_fast64_t test_max_depth = 2; // allows at most 1 interaction per ray
+    OptixRunner runner;
+    runner.set_max_ray_depth(test_max_depth);
+    ASSERT_EQ(runner.get_max_ray_depth(), static_cast<uint8_t>(test_max_depth));
+
+    RunnerStatus sts = runner.initialize();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.setup_simulation(&sd);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.run_simulation();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    OptixCSP::SolTraceSystem *sys = runner.get_optix_system();
+    std::vector<float4> hp_vec;
+    std::vector<uint_fast64_t> raynumber_vec;
+    std::vector<int32_t> element_id_vec;
+    std::vector<uint8_t> hit_type_vec;
+    sys->get_hp_output(hp_vec, raynumber_vec, element_id_vec, hit_type_vec);
+
+    // Walk through every record.  HIT_CREATE marks the start of a new ray;
+    // all other types are element interactions.  No ray may accumulate more
+    // than (max_ray_depth - 1) interactions.
+    const uint_fast64_t max_interactions = test_max_depth - 1;
+    uint_fast64_t current_interactions = 0;
+    for (uint8_t ht : hit_type_vec)
+    {
+        if (ht == OptixCSP::HitType::HIT_CREATE)
+        {
+            current_interactions = 0;
+        }
+        else
+        {
+            ++current_interactions;
+            EXPECT_LE(current_interactions, max_interactions)
+                << "A ray exceeded max_ray_depth - 1 element interactions";
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// GPU trace tests: verify the depth-exceeded counter behaves correctly.
+// ---------------------------------------------------------------------------
+
+// With max_depth at the default (5), the two-plate scene (reflect→absorb) uses
+// only 2 depth slots per ray — well within the limit.  The counter must stay 0.
+TEST(OptixRunnerMaxRayDepth, DepthExceededCounterIsZeroWhenDepthNotExceeded)
+{
+    SimulationData sd;
+    element_ptr plate1, plate2;
+    make_two_plate_sd(sd, plate1, plate2);
+
+    OptixRunner runner;
+    // Leave max_ray_depth at the default (5); the scene only needs depth 2.
+    RunnerStatus sts = runner.initialize();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.setup_simulation(&sd);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.run_simulation();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    EXPECT_EQ(runner.get_N_depth_exceeded_rays(), 0u);
+}
+
+// With max_depth=2, the two-plate scene where BOTH plates are reflective forces
+// rays that reach plate 2 to exceed the depth limit (new_depth=2 >= max_depth=2,
+// and plate 2 is non-absorbing).  Not all reflections from plate 1 geometrically
+// reach plate 2, so counter <= n_hit.  The key check is that counter > 0 and
+// never exceeds the total number of hit rays.
+TEST(OptixRunnerMaxRayDepth, DepthExceededCounterCountsTerminatedReflectedRays)
+{
+    SimulationData sd;
+    element_ptr plate1, plate2;
+    make_two_plate_sd(sd, plate1, plate2);
+
+    // Override plate 2 to be reflective so hitting it at max_depth triggers the counter.
+    plate2->get_front_optical_properties()->set_ideal_reflection();
+    plate2->get_back_optical_properties()->set_ideal_reflection();
+
+    OptixRunner runner;
+    runner.set_max_ray_depth(2); // max interactions per ray = 1 (plate 1 only)
+    ASSERT_EQ(runner.get_max_ray_depth(), 2u);
+
+    RunnerStatus sts = runner.initialize();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.setup_simulation(&sd);
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+    sts = runner.run_simulation();
+    ASSERT_EQ(sts, RunnerStatus::SUCCESS);
+
+    const uint_fast64_t counter = runner.get_N_depth_exceeded_rays();
+    const uint_fast64_t n_hit = runner.get_number_rays_traced();
+
+    // Every ray that reached plate 2 (a subset of the plate 1 hits —
+    // some reflections miss plate 2 entirely) must have incremented the counter.
+    EXPECT_GT(counter, 0u) << "Expected depth-exceeded counter to be non-zero";
+    EXPECT_LE(counter, n_hit)
+        << "Counter cannot exceed the total number of hit rays";
+}
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp
new file mode 100644
index 00000000..244a9687
--- /dev/null
+++ b/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp
@@ -0,0 +1,309 @@
+// ray_position_sampling_test.cpp
+//
+// Tests for sun parallelogram position sampling (GenType::HALTON and
+// GenType::RANDOM).  Each test fires rays straight down onto a large flat plate
+// with no sun-shape or optical errors, then inspects position[0] of every ray
+// record — the ray-generation point on the sun parallelogram — to verify the
+// spatial distribution.
+
+#include <gtest/gtest.h>
+
+#include <optix_runner.hpp>
+#include <simulation_data.hpp>
+#include <simulation_data_export.hpp>
+#include <simulation_result_export.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <numeric>
+
+using SolTrace::Runner::RunnerStatus;
+
+namespace
+{
+    // ---- Minimal KS-test infrastructure ------------------------------------
+    // (duplicated from sun_test.cpp — kept local to this TU)
+
+    static double clamp01(double x)
+    {
+        return x <= 0.0 ? 0.0 : (x >= 1.0 ? 1.0 : x);
+    }
+
+    static double ks_statistic(const std::vector<double>& samples,
+                               const std::function<double(double)>& cdf)
+    {
+        if (samples.empty()) return 1.0;
+        std::vector<double> sorted = samples;
+        std::sort(sorted.begin(), sorted.end());
+        const double n = static_cast<double>(sorted.size());
+        double d = 0.0;
+        for (size_t i = 0; i < sorted.size(); ++i)
+        {
+            const double f      = clamp01(cdf(sorted[i]));
+            const double emp_lo = static_cast<double>(i) / n;
+            const double emp_hi = static_cast<double>(i + 1) / n;
+            d = std::max(d, std::abs(f - emp_lo));
+            d = std::max(d, std::abs(emp_hi - f));
+        }
+        return d;
+    }
+
+    static double ks_pvalue_asymptotic(double d, size_t n)
+    {
+        if (n == 0) return 0.0;
+        if (d <= 0.0) return 1.0;
+        const double sqrtn = std::sqrt(static_cast<double>(n));
+        const double x = (sqrtn + 0.12 + 0.11 / sqrtn) * d;
+        double sum = 0.0;
+        for (int k = 1; k <= 100; ++k)
+        {
+            const double term = std::exp(-2.0 * k * k * x * x);
+            sum += (k % 2 == 1) ? term : -term;
+            if (term < 1.0e-12) break;
+        }
+        return clamp01(2.0 * sum);
+    }
+
+    static double ks_pvalue(const std::vector<double>& samples,
+                            const std::function<double(double)>& cdf)
+    {
+        return ks_pvalue_asymptotic(ks_statistic(samples, cdf), samples.size());
+    }
+
+    // Test that `coords` (a 1-D projection of source positions) is consistent
+    // with a uniform distribution.  Coordinates are normalised to [0,1] using
+    // the empirical range before the KS test.  For N >= 1 000 from a true
+    // uniform distribution the normalisation bias is O(1/N) — negligible
+    // against the ~1% critical value.
+    static double ks_pvalue_uniform1d(const std::vector<double>& coords)
+    {
+        if (coords.size() < 2) return 0.0;
+        const double lo = *std::min_element(coords.begin(), coords.end());
+        const double hi = *std::max_element(coords.begin(), coords.end());
+        if (hi - lo < 1.0e-9) return 0.0;
+
+        std::vector<double> u;
+        u.reserve(coords.size());
+        for (double c : coords)
+            u.push_back((c - lo) / (hi - lo));
+
+        return ks_pvalue(u, [](double x) { return x; });
+    }
+
+    // ---- Scene helpers -----------------------------------------------------
+
+    // Build a SimulationData with a large flat plate at z=50 (200 × 200 world
+    // units), perfectly reflective, no optical or sun-shape errors.  The plate
+    // is intentionally much larger than any realistic sun parallelogram so that
+    // all generated rays hit it and every source position is recorded.
+    static void make_large_plate_scene(SimulationData& sd, int seed = 42)
+    {
+        sd.clear();
+
+        auto stage = make_stage(0);
+        stage->set_origin(0, 0, 0);
+        stage->set_aim_vector(0, 0, 1);
+        stage->set_name("stage");
+
+        auto plate = make_element<SingleElement>();
+        plate->set_origin(0, 0, 50);
+        plate->set_aim_vector(0, 0, 100);
+        plate->set_surface(make_surface<Flat>());
+        plate->set_aperture(make_aperture<Rectangle>(200, 200));
+        OpticalProperties op(InteractionType::REFLECTION,
+                             DistributionType::NONE,
+                             0, 1, 0, 0, 0, 0);
+        plate->set_front_optical_properties(op);
+        plate->set_back_optical_properties(op);
+        plate->set_name("plate");
+
+        stage->add_element(plate);
+        sd.add_stage(stage);
+
+        SimulationParameters& params       = sd.get_simulation_parameters();
+        params.number_of_rays              = 20000;
+        params.max_number_of_rays          = params.number_of_rays * 10;
+        params.include_optical_errors      = false;
+        params.include_sun_shape_errors    = false;
+        params.seed                        = seed;
+    }
+
+    // Add a sun pointing straight down (position overhead at z=100) to an
+    // already-configured SimulationData, using the requested gen_type.
+    static void add_sun(SimulationData& sd, SolTrace::Data::GenType gen_type)
+    {
+        auto sun = make_ray_source<Sun>();
+        sun->set_position(0, 0, 100);
+        // PILLBOX shape with no sun-shape errors keeps rays exactly parallel,
+        // so hit positions on the plate are the source positions projected down.
+        sun->set_shape(SolTrace::Data::SunShape::PILLBOX, 0.0, 4.65, 0.0);
+        sun->set_gen_type(gen_type);
+        sd.add_ray_source(sun);
+    }
+
+    // Run the simulation and populate `result`.  Returns false on any failure.
+    static bool run_sim(SimulationData& sd, SimulationResult& result)
+    {
+        OptixRunner runner;
+        if (runner.initialize()           != RunnerStatus::SUCCESS) return false;
+        if (runner.setup_simulation(&sd)  != RunnerStatus::SUCCESS) return false;
+        if (runner.run_simulation()       != RunnerStatus::SUCCESS) return false;
+        if (runner.report_simulation(&result, 0) != RunnerStatus::SUCCESS) return false;
+        return true;
+    }
+
+    // Extract the X and Y components of position[0] (the ray-generation point)
+    // for every ray that has at least one surface interaction recorded.
+    static void collect_source_xy(const SimulationResult& result,
+                                  std::vector<double>& xs,
+                                  std::vector<double>& ys)
+    {
+        xs.clear();
+        ys.clear();
+        for (int i = 0; i < result.get_number_of_records(); ++i)
+        {
+            auto rec = result[i];
+            if (!rec || rec->get_number_of_interactions() < 1) continue;
+            glm::dvec3 p;
+            rec->get_position(0, p);
+            xs.push_back(p[0]);
+            ys.push_back(p[1]);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+// GenType::RANDOM: the marginal X and Y distributions of source positions
+// should be statistically consistent with a uniform distribution over the
+// sun parallelogram.
+TEST(RayPositionSampling, Random_UniformMarginals)
+{
+    SimulationData sd;
+    make_large_plate_scene(sd);
+    add_sun(sd, SolTrace::Data::GenType::RANDOM);
+
+    SimulationResult result;
+    ASSERT_TRUE(run_sim(sd, result));
+    ASSERT_GT(result.get_number_of_records(), 0);
+
+    std::vector<double> xs, ys;
+    collect_source_xy(result, xs, ys);
+    ASSERT_GE(xs.size(), 1000u);
+
+    const double p_x = ks_pvalue_uniform1d(xs);
+    const double p_y = ks_pvalue_uniform1d(ys);
+
+    EXPECT_GT(p_x, 1.0e-6) << "X marginal deviates significantly from uniform";
+    EXPECT_GT(p_y, 1.0e-6) << "Y marginal deviates significantly from uniform";
+}
+
+// GenType::HALTON: same uniformity requirement, via the Halton low-discrepancy
+// sequence (bases 2 and 3 for the two parallelogram axes).
+TEST(RayPositionSampling, Halton_UniformMarginals)
+{
+    SimulationData sd;
+    make_large_plate_scene(sd);
+    add_sun(sd, SolTrace::Data::GenType::HALTON);
+
+    SimulationResult result;
+    ASSERT_TRUE(run_sim(sd, result));
+    ASSERT_GT(result.get_number_of_records(), 0);
+
+    std::vector<double> xs, ys;
+    collect_source_xy(result, xs, ys);
+    ASSERT_GE(xs.size(), 1000u);
+
+    const double p_x = ks_pvalue_uniform1d(xs);
+    const double p_y = ks_pvalue_uniform1d(ys);
+
+    EXPECT_GT(p_x, 1.0e-6) << "X marginal deviates significantly from uniform";
+    EXPECT_GT(p_y, 1.0e-6) << "Y marginal deviates significantly from uniform";
+}
+
+// GenType::HALTON is deterministic: the seed field has no effect because the
+// sequence depends only on the ray index.  Two runs with different seeds must
+// produce the same set of source positions.
+TEST(RayPositionSampling, Halton_Deterministic)
+{
+    // Run 1: seed = 1
+    SimulationData sd1;
+    make_large_plate_scene(sd1, /*seed=*/1);
+    add_sun(sd1, SolTrace::Data::GenType::HALTON);
+
+    SimulationResult r1;
+    ASSERT_TRUE(run_sim(sd1, r1));
+
+    // Run 2: seed = 99999 (different, should be ignored by Halton)
+    SimulationData sd2;
+    make_large_plate_scene(sd2, /*seed=*/99999);
+    add_sun(sd2, SolTrace::Data::GenType::HALTON);
+
+    SimulationResult r2;
+    ASSERT_TRUE(run_sim(sd2, r2));
+
+    ASSERT_EQ(r1.get_number_of_records(), r2.get_number_of_records());
+
+    std::vector<double> xs1, ys1, xs2, ys2;
+    collect_source_xy(r1, xs1, ys1);
+    collect_source_xy(r2, xs2, ys2);
+    ASSERT_EQ(xs1.size(), xs2.size());
+
+    // Sort both and compare element-by-element.
+    std::sort(xs1.begin(), xs1.end());
+    std::sort(xs2.begin(), xs2.end());
+    std::sort(ys1.begin(), ys1.end());
+    std::sort(ys2.begin(), ys2.end());
+
+    for (size_t i = 0; i < xs1.size(); ++i)
+    {
+        EXPECT_NEAR(xs1[i], xs2[i], 1.0e-4)
+            << "Halton X differs at sorted index " << i
+            << " — sequence is not deterministic across seeds";
+        EXPECT_NEAR(ys1[i], ys2[i], 1.0e-4)
+            << "Halton Y differs at sorted index " << i
+            << " — sequence is not deterministic across seeds";
+    }
+}
+
+// GenType::RANDOM IS seed-dependent: two runs with different seeds must
+// produce distinguishably different position sets.
+TEST(RayPositionSampling, Random_SeedDependent)
+{
+    SimulationData sd1;
+    make_large_plate_scene(sd1, /*seed=*/1);
+    add_sun(sd1, SolTrace::Data::GenType::RANDOM);
+
+    SimulationData sd2;
+    make_large_plate_scene(sd2, /*seed=*/2);
+    add_sun(sd2, SolTrace::Data::GenType::RANDOM);
+
+    SimulationResult r1, r2;
+    ASSERT_TRUE(run_sim(sd1, r1));
+    ASSERT_TRUE(run_sim(sd2, r2));
+    ASSERT_GT(r1.get_number_of_records(), 0);
+    ASSERT_GT(r2.get_number_of_records(), 0);
+
+    std::vector<double> xs1, ys1, xs2, ys2;
+    collect_source_xy(r1, xs1, ys1);
+    collect_source_xy(r2, xs2, ys2);
+    ASSERT_FALSE(xs1.empty());
+    ASSERT_FALSE(xs2.empty());
+
+    const double mean_x1 = std::accumulate(xs1.begin(), xs1.end(), 0.0) / xs1.size();
+    const double mean_x2 = std::accumulate(xs2.begin(), xs2.end(), 0.0) / xs2.size();
+    const double mean_y1 = std::accumulate(ys1.begin(), ys1.end(), 0.0) / ys1.size();
+    const double mean_y2 = std::accumulate(ys2.begin(), ys2.end(), 0.0) / ys2.size();
+
+    // With 20 000 rays and different seeds the probability that both means
+    // coincide to within 1e-4 is negligible.
+    const bool x_differs = std::abs(mean_x1 - mean_x2) > 1.0e-4;
+    const bool y_differs = std::abs(mean_y1 - mean_y2) > 1.0e-4;
+
+    EXPECT_TRUE(x_differs || y_differs)
+        << "Seeds 1 and 2 produced indistinguishable mean source positions; "
+           "seed variation may not be wired up for RANDOM gen_type.";
+}
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp
index c2ff398e..5db70cdf 100644
--- a/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp
+++ b/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp
@@ -804,3 +804,58 @@ TEST(Sun, UserDefinedSunAngleDistribution)
     EXPECT_GT(frac_beyond_disc, 0.0);
     EXPECT_LT(frac_beyond_disc, 0.05);
 }
+
+// Verify that a warning is printed to stderr when Halton ray generation is used
+// with max_number_of_rays exceeding UINT32_MAX, since the Halton index is truncated
+// to 32 bits and positions will repeat.
+TEST(Sun, HaltonWarningWhenExceedingUInt32Max)
+{
+    SimulationData sd;
+    element_ptr plate;
+    make_default_sd_sun(sd, plate);
+
+    auto sun = make_ray_source<Sun>();
+    sun->set_position(0, 0, 100);
+    sun->set_gen_type(SolTrace::Data::GenType::HALTON);
+    sd.add_ray_source(sun);
+
+    SimulationParameters& params = sd.get_simulation_parameters();
+    params.number_of_rays = 1000;
+    params.max_number_of_rays = static_cast<uint_fast64_t>(std::numeric_limits<uint32_t>::max()) + 1;
+
+    OptixRunner runner;
+    testing::internal::CaptureStderr();
+    ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS);
+    const std::string output = testing::internal::GetCapturedStderr();
+
+    EXPECT_NE(output.find("Halton"), std::string::npos)
+        << "Expected a warning about Halton index overflow in stderr, but got: " << output;
+}
+
+// Verify that no warning is printed when Halton is used with max_number_of_rays
+// within the 32-bit range.
+TEST(Sun, NoHaltonWarningWithinUInt32Range)
+{
+    SimulationData sd;
+    element_ptr plate;
+    make_default_sd_sun(sd, plate);
+
+    auto sun = make_ray_source<Sun>();
+    sun->set_position(0, 0, 100);
+    sun->set_gen_type(SolTrace::Data::GenType::HALTON);
+    sd.add_ray_source(sun);
+
+    SimulationParameters& params = sd.get_simulation_parameters();
+    params.number_of_rays = 1000;
+    params.max_number_of_rays = static_cast<uint_fast64_t>(std::numeric_limits<uint32_t>::max());
+
+    OptixRunner runner;
+    testing::internal::CaptureStderr();
+    ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS);
+    const std::string output = testing::internal::GetCapturedStderr();
+
+    // Checking for "Warning" may result in failure due to other warning. Since
+    // this should be clean, I consider that to be a good thing.
+    EXPECT_EQ(output.find("Warning"), std::string::npos)
+        << "Unexpected Halton warning in stderr: " << output;
+}
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp
index 17385bee..b88f5916 100644
--- a/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp
+++ b/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp
@@ -151,12 +151,12 @@ TEST(TwoPlateOptix, ReflectionToAbsorber)
 	std::vector<int32_t> element_id_vec;
 	std::vector<uint8_t> hit_type_vec;
 	sys->get_hp_output(hp_vec, raynumber_vec, element_id_vec, hit_type_vec);
-	std::vector<uint_fast64_t> sunraynumber_vec = sys->get_sunraynumber_vec();
+	// std::vector<uint_fast64_t> sunraynumber_vec = sys->get_sunraynumber_vec();
 
 	EXPECT_EQ(hp_vec.size(), raynumber_vec.size());						// Hit results are same size
 	EXPECT_EQ(raynumber_vec.size(), element_id_vec.size());
 	EXPECT_EQ(element_id_vec.size(), hit_type_vec.size());
-	EXPECT_EQ(N_sun_rays, sunraynumber_vec.back());						// Reported sun rays is the sun ray id of last hit
+	// EXPECT_EQ(N_sun_rays, sunraynumber_vec.back());						// Reported sun rays is the sun ray id of last hit
 	EXPECT_TRUE(N_sun_rays <= sd.get_simulation_parameters().max_number_of_rays);	// Only generated max number of rays or fewer
 }
 
@@ -245,4 +245,41 @@ TEST(TwoPlateOptix, SimResults)
 		}
 	}
 	
-}
\ No newline at end of file
+}
+
+TEST(TwoPlateOptix, TrimExcessRaysOption)
+{
+	SimulationData sd;
+	element_ptr plate1, plate2;
+	make_two_plate_sd(sd, plate1, plate2);
+	const int n_rays = sd.get_simulation_parameters().number_of_rays;
+
+	// Default: trim enabled — result has exactly n_rays records
+	{
+		OptixRunner runner;
+		EXPECT_TRUE(runner.get_trim_excess_rays());  // default is true
+
+		ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS);
+		ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS);
+		ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS);
+
+		SimulationResult result;
+		ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS);
+		EXPECT_EQ(result.get_number_of_records(), n_rays);
+	}
+
+	// Trim disabled — result has at least n_rays records (batch overshoot is not removed)
+	{
+		OptixRunner runner;
+		runner.set_trim_excess_rays(false);
+		EXPECT_FALSE(runner.get_trim_excess_rays());
+
+		ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS);
+		ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS);
+		ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS);
+
+		SimulationResult result;
+		ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS);
+		EXPECT_GE(result.get_number_of_records(), n_rays);
+	}
+}