From e59fc3b75bc2c161429b19afd836d594c1ed6ffa Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Fri, 27 Mar 2026 16:13:32 -0600 Subject: [PATCH 01/60] Initial run at adding flat circle to OptixRunner --- .../OptixCSP/src/core/CspElement.cpp | 10 + .../OptixCSP/src/core/geometry_manager.cpp | 19 +- .../OptixCSP/src/shaders/GeometryDataST.h | 29 +- .../OptixCSP/src/shaders/Soltrace.h | 1 + .../OptixCSP/src/shaders/intersection.cu | 265 ++++++++++-------- .../optix_runner/optix_runner.cpp | 9 + .../geometry_intersection_test.cpp | 36 +++ 7 files changed, 253 insertions(+), 116 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp index 7bc06cf6..82cc29e9 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp @@ -322,6 +322,16 @@ GeometryDataST CspElement::toDeviceGeometryData() const geometry_data.setQuadrilateral_Flat(heliostat); } + if (aperture_type == ApertureType::CIRCLE) + { + float r = circ->get_radius(); + float3 o = OptixCSP::toFloat3(m_origin); + float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin)); + ApertureCircle circ = static_cast(*m_aperture); + GeometryDataST::Circle_Flat heliostat(o, n, r); + geometry_data.setCircle_Flat(heliostat); + } + geometry_data.id = this->m_id; return geometry_data; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp index 944dc5b2..648dbd74 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp @@ -41,9 +41,22 @@ void GeometryManager::collect_geometry_info(const std::vectorget_aperture_type() == ApertureType::CIRCLE) - // { - // } + if 
(element->get_aperture_type() == ApertureType::CIRCLE) + { + if (element->get_surface_type() == SurfaceType::FLAT) + { + sbt_offset = static_cast(OpticalEntityType::CIRCLE_FLAT); + } + else + { + std::stringstream ss; + ss << "Unimplemented surface type (" + << static_cast(element->get_surface_type()) + << ") for circular aperture (" + << static_cast(element->get_aperture_type()); + throw std::runtime_error(ss.str()); + } + } if (element->get_aperture_type() == ApertureType::RECTANGLE) { diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h index 5abd5009..57d28f4d 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h @@ -21,7 +21,8 @@ namespace OptixCSP { UNKNOWN_TYPE = 3, RECTANGLE_FLAT = 4, TRIANGLE_FLAT = 5, - QUADRILATERAL_FLAT = 6 + QUADRILATERAL_FLAT = 6, + CIRCLE_FLAT = 7 }; struct Parallelogram @@ -137,6 +138,19 @@ namespace OptixCSP { float3 normal; // Positive direction follows right-hand rule }; + struct Circle_Flat{ + Circle_Flat() = default; + // Circle_Flat(const float radius) : r(radius) {} + Circle_Flat(const float3 &origin, const float3 &normal, const float &radius) + : r(radius), center(origin) + { + plane = make_float4(normalize(normal), dot(center, normal)); + } + float4 plane; + float3 center; + float r; + }; + GeometryDataST() = default; void setParallelogram(const Parallelogram& p) @@ -217,6 +231,18 @@ namespace OptixCSP { return quadrilateral_flat; } + void setCircle_Flat(const Circle_Flat &c) + { + assert(type == UNKNOWN_TYPE); + type = CIRCLE_FLAT; + circle_flat = c; + } + + __host__ __device__ const Circle_Flat& getCircle_Flat() const + { + assert(type == CIRCLE_FLAT); + return circle_flat; + } Type type = UNKNOWN_TYPE; @@ -231,6 +257,7 @@ namespace OptixCSP { Rectangle_Flat rectangle_flat; 
Triangle_Flat triangle_flat; Quadrilateral_Flat quadrilateral_flat; + Circle_Flat circle_flat; }; }; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index 54944317..cc4af199 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -31,6 +31,7 @@ namespace OptixCSP{ CYLINDRICAL = 2, TRIANGLE_FLAT = 3, QUADRILATERAL_FLAT = 4, + CIRCLE_FLAT = 5, NUM_OPTICAL_ENTITY_TYPES }; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu index d3540db3..2e39d26d 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu @@ -1,46 +1,52 @@ #include -//#include +// #include #include "Soltrace.h" #include #include "GeometryDataST.h" -extern "C" { +extern "C" +{ __constant__ OptixCSP::LaunchParams params; } +extern "C" __device__ __inline__ float ray_distance_to_plane(float3 ro, float3 rd, float4 plane) +{ + const float3 n = make_float3(plane) return (plane.w - dot(n, ro)) / dot(rd, n); +} extern "C" __global__ void __intersection__parallelogram() { - int i = optixGetPrimitiveIndex(); - const OptixCSP::GeometryDataST::Parallelogram& parallelogram = params.geometry_data_array[i].getParallelogram(); - + int i = optixGetPrimitiveIndex(); + const OptixCSP::GeometryDataST::Parallelogram ¶llelogram = params.geometry_data_array[i].getParallelogram(); + // Get ray information: origin, direction, and min/max distances over which ray should be tested const float3 ray_orig = optixGetWorldRayOrigin(); - const float3 ray_dir = optixGetWorldRayDirection(); - const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax(); + const float3 
ray_dir = optixGetWorldRayDirection(); + const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax(); - // Compute ray intersection point - float3 n = make_float3( parallelogram.plane ); - float dt = dot( ray_dir, n ); - // Compute distance t (point of intersection) along ray direction from ray origin - float t = ( parallelogram.plane.w - dot( n, ray_orig ) ) / dt; + // // Compute ray intersection point + // float3 n = make_float3( parallelogram.plane ); + // float dt = dot( ray_dir, n ); + // // Compute distance t (point of intersection) along ray direction from ray origin + // float t = ( parallelogram.plane.w - dot( n, ray_orig ) ) / dt; + float t = ray_distance_to_plane(ray_orig, ray_dir, paralllelogram.plane); // Verify intersection distance and Report ray intersection point - if( t > ray_tmin && t < ray_tmax ) + if (t > ray_tmin && t < ray_tmax) { - float3 p = ray_orig + ray_dir * t; + float3 p = ray_orig + ray_dir * t; float3 vi = p - parallelogram.anchor; - float a1 = dot( parallelogram.v1, vi ); - if( a1 >= 0 && a1 <= 1 ) + float a1 = dot(parallelogram.v1, vi); + if (a1 >= 0 && a1 <= 1) { - float a2 = dot( parallelogram.v2, vi ); - if( a2 >= 0 && a2 <= 1 ) + float a2 = dot(parallelogram.v2, vi); + if (a2 >= 0 && a2 <= 1) { - optixReportIntersection( t, - 0, - __float_as_uint( n.x ), - __float_as_uint( n.y ), - __float_as_uint( n.z )); + optixReportIntersection(t, + 0, + __float_as_uint(n.x), + __float_as_uint(n.y), + __float_as_uint(n.z)); } } } @@ -49,61 +55,62 @@ extern "C" __global__ void __intersection__parallelogram() extern "C" __global__ void __intersection__rectangle_flat() { - const OptixCSP::GeometryDataST::Rectangle_Flat& rectangle = params.geometry_data_array[optixGetPrimitiveIndex()].getRectangle_Flat(); - + const OptixCSP::GeometryDataST::Rectangle_Flat &rectangle = params.geometry_data_array[optixGetPrimitiveIndex()].getRectangle_Flat(); + const float3 ray_orig = optixGetWorldRayOrigin(); const float3 ray_dir = 
optixGetWorldRayDirection(); const float ray_tmin = optixGetRayTmin(); const float ray_tmax = optixGetRayTmax(); - // Get plane normal and distance - float3 n = make_float3(rectangle.plane); - float dt = dot(ray_dir, n); - - // Compute distance t (point of intersection) along ray direction from ray origin - float t = (rectangle.plane.w - dot(n, ray_orig)) / dt; + // // Get plane normal and distance + // float3 n = make_float3(rectangle.plane); + // float dt = dot(ray_dir, n); + + // // Compute distance t (point of intersection) along ray direction from ray origin + // float t = (rectangle.plane.w - dot(n, ray_orig)) / dt; + float t = ray_distance_to_plane(ray_orig, ray_dir, rectangle.plane); // Verify intersection distance if (t > ray_tmin && t < ray_tmax) { // Compute intersection point float3 p = ray_orig + ray_dir * t; - + // Compute vector from center to intersection point float3 v = p - rectangle.center; - + // Project onto x and y to get local coordinates float x = dot(rectangle.x, v); float y = dot(rectangle.y, v); - + // Check if point is within rectangle bounds - if (x >= -rectangle.width/2 && x <= rectangle.width/2 && - y >= -rectangle.height/2 && y <= rectangle.height/2) + if (x >= -rectangle.width / 2 && x <= rectangle.width / 2 && + y >= -rectangle.height / 2 && y <= rectangle.height / 2) { optixReportIntersection(t, - 0, - __float_as_uint(n.x), - __float_as_uint(n.y), - __float_as_uint(n.z)); + 0, + __float_as_uint(n.x), + __float_as_uint(n.y), + __float_as_uint(n.z)); } } } extern "C" __global__ void __intersection__cylinder_y() { - const OptixCSP::GeometryDataST::Cylinder_Y& cyl = params.geometry_data_array[optixGetPrimitiveIndex()].getCylinder_Y(); + const OptixCSP::GeometryDataST::Cylinder_Y &cyl = params.geometry_data_array[optixGetPrimitiveIndex()].getCylinder_Y(); // Get ray information: origin, direction, and min/max distances over which ray should be tested const float3 ray_orig = optixGetWorldRayOrigin(); const float3 ray_dir = 
normalize(optixGetWorldRayDirection()); - const float ray_tmin = optixGetRayTmin(); - const float ray_tmax = optixGetRayTmax(); + const float ray_tmin = optixGetRayTmin(); + const float ray_tmax = optixGetRayTmax(); // Transform ray to the cylinder's local coordinate system float3 local_ray_orig = ray_orig - cyl.center; float3 local_ray_dir = ray_dir; - // TODO: check how to optimize this, there should be a way in optix to rotate coordinates + // TODO: check how to optimize this, there should be a way in optix to rotate coordinates float3 local_x = cyl.base_x; float3 local_z = cyl.base_z; float3 local_y = cross(local_z, local_x); @@ -111,15 +118,13 @@ extern "C" __global__ void __intersection__cylinder_y() local_ray_orig = make_float3( dot(local_ray_orig, local_x), dot(local_ray_orig, local_y), - dot(local_ray_orig, local_z) - ); + dot(local_ray_orig, local_z)); local_ray_dir = make_float3( dot(local_ray_dir, local_x), dot(local_ray_dir, local_y), - dot(local_ray_dir, local_z) - ); + dot(local_ray_dir, local_z)); - // solve quadratic equation for intersection + // solve quadratic equation for intersection float A = local_ray_dir.x * local_ray_dir.x + local_ray_dir.z * local_ray_dir.z; float B = 2.0f * (local_ray_orig.x * local_ray_dir.x + local_ray_orig.z * local_ray_dir.z); float C = local_ray_orig.x * local_ray_orig.x + local_ray_orig.z * local_ray_orig.z - cyl.radius * cyl.radius; @@ -169,23 +174,23 @@ extern "C" __global__ void __intersection__cylinder_y() // Report intersection to OptiX optixReportIntersection(t, - 0, - __float_as_uint(world_normal.x), - __float_as_uint(world_normal.y), - __float_as_uint(world_normal.z)); + 0, + __float_as_uint(world_normal.x), + __float_as_uint(world_normal.y), + __float_as_uint(world_normal.z)); } -// ray cylinder intersection with top and bottom caps -// it can also be modeled as cylinder with two disks. +// ray cylinder intersection with top and bottom caps +// it can also be modeled as cylinder with two disks. 
extern "C" __global__ void __intersection__cylinder_y_capped() { - const OptixCSP::GeometryDataST::Cylinder_Y& cyl = params.geometry_data_array[optixGetPrimitiveIndex()].getCylinder_Y(); + const OptixCSP::GeometryDataST::Cylinder_Y &cyl = params.geometry_data_array[optixGetPrimitiveIndex()].getCylinder_Y(); // Get ray information: origin, direction, and min/max distances over which ray should be tested const float3 ray_orig = optixGetWorldRayOrigin(); const float3 ray_dir = normalize(optixGetWorldRayDirection()); - const float ray_tmin = optixGetRayTmin(); - const float ray_tmax = optixGetRayTmax(); + const float ray_tmin = optixGetRayTmin(); + const float ray_tmax = optixGetRayTmax(); // Transform ray to the cylinder's local coordinate system float3 local_ray_orig = ray_orig - cyl.center; @@ -199,13 +204,11 @@ extern "C" __global__ void __intersection__cylinder_y_capped() local_ray_orig = make_float3( dot(local_ray_orig, local_x), dot(local_ray_orig, local_y), - dot(local_ray_orig, local_z) - ); + dot(local_ray_orig, local_z)); local_ray_dir = make_float3( dot(local_ray_dir, local_x), dot(local_ray_dir, local_y), - dot(local_ray_dir, local_z) - ); + dot(local_ray_dir, local_z)); // Solve quadratic equation for intersection with curved surface float A = local_ray_dir.x * local_ray_dir.x + local_ray_dir.z * local_ray_dir.z; @@ -240,7 +243,7 @@ extern "C" __global__ void __intersection__cylinder_y_capped() { float t = (-cyl.half_height - local_ray_orig.y) / local_ray_dir.y; float2 hit_point = make_float2(local_ray_orig.x + t * local_ray_dir.x, - local_ray_orig.z + t * local_ray_dir.z); + local_ray_orig.z + t * local_ray_dir.z); if (t > ray_tmin && t < ray_tmax && dot(hit_point, hit_point) <= cyl.radius * cyl.radius) { t_caps = t; @@ -252,7 +255,7 @@ extern "C" __global__ void __intersection__cylinder_y_capped() { float t = (cyl.half_height - local_ray_orig.y) / local_ray_dir.y; float2 hit_point = make_float2(local_ray_orig.x + t * local_ray_dir.x, - local_ray_orig.z 
+ t * local_ray_dir.z); + local_ray_orig.z + t * local_ray_dir.z); if (t > ray_tmin && t < ray_tmax && dot(hit_point, hit_point) <= cyl.radius * cyl.radius) { t_caps = fminf(t_caps, t); @@ -294,12 +297,10 @@ extern "C" __global__ void __intersection__cylinder_y_capped() 0, // User-defined instance ID or custom data __float_as_uint(world_normal.x), __float_as_uint(world_normal.y), - __float_as_uint(world_normal.z) - ); + __float_as_uint(world_normal.z)); } - -// For a parabolic surface rectangle aperture where +// For a parabolic surface rectangle aperture where // the base (normal projection) is defined by the center and its two unit edge vectors // In a local coordinate system (with origin at the anchor) the flat rectangle covers: // x in [0, L1] and y in [0, L2], @@ -320,12 +321,12 @@ extern "C" __global__ void __intersection__cylinder_y_capped() // The local hit point is then transformed back to world space for reporting. extern "C" __global__ void __intersection__rectangle_parabolic() { - const OptixCSP::GeometryDataST::Rectangle_Parabolic& rect = params.geometry_data_array[optixGetPrimitiveIndex()].getRectangleParabolic(); + const OptixCSP::GeometryDataST::Rectangle_Parabolic &rect = params.geometry_data_array[optixGetPrimitiveIndex()].getRectangleParabolic(); // Get ray information. const float3 ray_orig = optixGetWorldRayOrigin(); const float3 ray_dir = optixGetWorldRayDirection(); - const float ray_tmin = optixGetRayTmin(); - const float ray_tmax = optixGetRayTmax(); + const float ray_tmin = optixGetRayTmin(); + const float ray_tmax = optixGetRayTmax(); // // Build the local coordinate system. @@ -375,23 +376,28 @@ extern "C" __global__ void __intersection__rectangle_parabolic() const float eps = 1e-12f; bool valid = false; - if (fabsf(A) < eps) { + if (fabsf(A) < eps) + { // Degenerate (linear) case. 
t = -C / B; valid = (t > 0.0f); } - else { + else + { float discr = B * B - 4.0f * A * C; - if (discr >= 0.0f) { + if (discr >= 0.0f) + { float sqrt_discr = sqrtf(discr); float t1 = (-B - sqrt_discr) / (2.0f * A); float t2 = (-B + sqrt_discr) / (2.0f * A); // Choose the smallest positive t. - if (t1 > 0.0f && t1 < t2) { + if (t1 > 0.0f && t1 < t2) + { t = t1; valid = true; } - else if (t2 > 0.0f) { + else if (t2 > 0.0f) + { t = t2; valid = true; } @@ -399,7 +405,8 @@ extern "C" __global__ void __intersection__rectangle_parabolic() } // Discard if no valid t or if t is not within the ray's bounds. - if (!valid || t < ray_tmin || t > ray_tmax) { + if (!valid || t < ray_tmin || t > ray_tmax) + { return; } @@ -415,9 +422,10 @@ extern "C" __global__ void __intersection__rectangle_parabolic() // The parametric coordinates are: // a1 = x_hit / (L1/2) and a2 = y_hit / (L2/2) // - float a1 = x_hit / (L1/2.); - float a2 = y_hit / (L2/2.); - if (a1 < -1.0f || a1 > 1.0f || a2 < -1.0f || a2 > 1.0f) { + float a1 = x_hit / (L1 / 2.); + float a2 = y_hit / (L2 / 2.); + if (a1 < -1.0f || a1 > 1.0f || a2 < -1.0f || a2 > 1.0f) + { return; } @@ -431,12 +439,12 @@ extern "C" __global__ void __intersection__rectangle_parabolic() // N_local = (-f_x, -f_y, 1) = ( -curv_x*x_hit, -curv_y*y_hit, 1 ). // float3 N_local = normalize(make_float3(-curv_x * x_hit, - -curv_y * y_hit, - 1.0f)); + -curv_y * y_hit, + 1.0f)); // Transform the normal back to world coordinates. float3 world_normal = normalize(N_local.x * e1 + - N_local.y * e2 + - N_local.z * n); + N_local.y * e2 + + N_local.z * n); // Compute the hit point in world space. float3 world_hit = ray_orig + t * ray_dir; @@ -445,19 +453,19 @@ extern "C" __global__ void __intersection__rectangle_parabolic() // Here, the two reported extra attributes are the parametric coordinates (a1, a2), // encoded as unsigned integers. 
optixReportIntersection(t, 0, - __float_as_uint(world_normal.x), - __float_as_uint(world_normal.y), - __float_as_uint(world_normal.z)); + __float_as_uint(world_normal.x), + __float_as_uint(world_normal.y), + __float_as_uint(world_normal.z)); } // intersection algorithm for a flat triangle based on "Fast, Minimum Storage Ray/Triangle Intersection" by M�ller and Trumbore (1997) -// code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm +// code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm extern "C" __device__ __inline__ float _triangle_intersect( float3 p0, float3 edge1, float3 edge2, float3 ro, float3 rd) { const float3 pvec = cross(rd, edge2); - const float det = dot(edge1, pvec); + const float det = dot(edge1, pvec); // // Backface culling + parallel rejection // // (det must be strictly positive and not tiny) @@ -467,26 +475,28 @@ extern "C" __device__ __inline__ float _triangle_intersect( // Parallel rejection // (det must be not tiny) const float eps = 1e-8f; - if (fabs(det) <= eps) return -1.0f; - + if (fabs(det) <= eps) + return -1.0f; const float inv_det = 1.0f / det; const float3 tvec = ro - p0; - const float u = dot(tvec, pvec) * inv_det; - if (u < 0.0f || u > 1.0f) return -1.0f; + const float u = dot(tvec, pvec) * inv_det; + if (u < 0.0f || u > 1.0f) + return -1.0f; const float3 qvec = cross(tvec, edge1); - const float v = dot(rd, qvec) * inv_det; - if (v < 0.0f || (u + v) > 1.0f) return -1.0f; + const float v = dot(rd, qvec) * inv_det; + if (v < 0.0f || (u + v) > 1.0f) + return -1.0f; - const float t = dot(edge2, qvec) * inv_det; + const float t = dot(edge2, qvec) * inv_det; return t; } // // intersection algorithm for a flat triangle based on "Fast, Minimum Storage Ray/Triangle Intersection" by M�ller and Trumbore (1997) -// // code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm +// // code from here: 
https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm // extern "C" __global__ void __intersection__triangle_flat() // { // const OptixCSP::GeometryDataST::Triangle_Flat& tri = params.geometry_data_array[optixGetPrimitiveIndex()].getTriangle_Flat(); @@ -499,7 +509,6 @@ extern "C" __device__ __inline__ float _triangle_intersect( // const float3 edge1 = tri.e1; // const float3 edge2 = tri.e2; - // const float3 pvec = cross(rd, edge2); // const float det = dot(edge1, pvec); @@ -516,7 +525,7 @@ extern "C" __device__ __inline__ float _triangle_intersect( // const float3 qvec = cross(tvec, edge1); // const float v = dot(rd, qvec) * inv_det; -// if (v < 0.0f || (u + v) > 1.0f) +// if (v < 0.0f || (u + v) > 1.0f) // return; // const float t = dot(edge2, qvec) * inv_det; @@ -532,31 +541,30 @@ extern "C" __device__ __inline__ float _triangle_intersect( // } // intersection algorithm for a flat triangle based on "Fast, Minimum Storage Ray/Triangle Intersection" by M�ller and Trumbore (1997) -// code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm +// code from here: https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm extern "C" __global__ void __intersection__triangle_flat() { - const OptixCSP::GeometryDataST::Triangle_Flat& tri = params.geometry_data_array[optixGetPrimitiveIndex()].getTriangle_Flat(); + const OptixCSP::GeometryDataST::Triangle_Flat &tri = params.geometry_data_array[optixGetPrimitiveIndex()].getTriangle_Flat(); const float3 ro = optixGetObjectRayOrigin(); const float3 rd = optixGetObjectRayDirection(); const float t = _triangle_intersect(tri.v0, tri.e1, tri.e2, ro, rd); - if (t < optixGetRayTmin() || t > optixGetRayTmax()) return; + if (t < optixGetRayTmin() || t > optixGetRayTmax()) + return; float3 world_normal = tri.normal; optixReportIntersection(t, 0, - __float_as_uint(world_normal.x), - __float_as_uint(world_normal.y), - 
__float_as_uint(world_normal.z)); - + __float_as_uint(world_normal.x), + __float_as_uint(world_normal.y), + __float_as_uint(world_normal.z)); } - extern "C" __global__ void __intersection__quadrilateral_flat() { - const OptixCSP::GeometryDataST::Quadrilateral_Flat& quad = params.geometry_data_array[optixGetPrimitiveIndex()].getQuadrilateral_Flat(); + const OptixCSP::GeometryDataST::Quadrilateral_Flat &quad = params.geometry_data_array[optixGetPrimitiveIndex()].getQuadrilateral_Flat(); const float3 ro = optixGetObjectRayOrigin(); const float3 rd = optixGetObjectRayDirection(); @@ -575,12 +583,45 @@ extern "C" __global__ void __intersection__quadrilateral_flat() t = _triangle_intersect(p2, e1, e2, ro, rd); } - if (t < optixGetRayTmin() || t > optixGetRayTmax()) return; + if (t < optixGetRayTmin() || t > optixGetRayTmax()) + return; float3 world_normal = quad.normal; optixReportIntersection(t, 0, - __float_as_uint(world_normal.x), - __float_as_uint(world_normal.y), - __float_as_uint(world_normal.z)); + __float_as_uint(world_normal.x), + __float_as_uint(world_normal.y), + __float_as_uint(world_normal.z)); +} + +extern "C" __global__ void __intersection__circle_flat() +{ + const OptixCSP::GeometryDataST::Circle_Flat &circ = params.geometry_data_array[optixGetPrimitiveIndex()].getCircle_Flat(); + + // Get ray information: origin, direction, and min/max distances over which ray should be tested + const float3 ray_orig = optixGetWorldRayOrigin(); + const float3 ray_dir = optixGetWorldRayDirection(); + const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax(); + + // // Compute ray intersection point + // float3 n = make_float3( circ.plane ); + // float dt = dot( ray_dir, n ); + // // Compute distance t (point of intersection) along ray direction from ray origin + // float t = ( circ.plane.w - dot( n, ray_orig ) ) / dt; + float t = ray_distance_to_plane(ray_orig, ray_dir, circ.plane); + + // Verify intersection distance and Report ray intersection point + if (t 
> ray_tmin && t < ray_tmax) + { + float3 p = ray_orig + ray_dir * t; + float d = length(p - circ.center); + if (d <= circ.r) + { + optixReportIntersection(t, + 0, + __float_as_uint(n.x), + __float_as_uint(n.y), + __float_as_uint(n.z)); + } + } } diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 156127d7..be1ef9a9 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -232,6 +232,15 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) break; } + case ApertureType::CIRCLE: + { + auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); + assert(el_aperture != nullptr); + auto aperture = std::make_shared(0.5 * el_aperture->diameter); + optix_el->set_aperture(aperture); + break; + } + case ApertureType::EQUILATERAL_TRIANGLE: { auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp index 1efd1ca3..0799cc38 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp @@ -268,3 +268,39 @@ TEST(OptixRunner, Cylinder) EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i; } } + +TEST(OptixRunner, FlatCircle) +{ + const double R = 5.0; + auto surf = make_surface(); + auto aper = make_aperture(2*R); + + SimulationData sd; + set_default_sd(sd, surf, aper); + SimulationResult result; + + OptixRunner runner; + RunnerStatus sts = runner.initialize(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.setup_simulation(&sd); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.run_simulation(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = 
runner.report_simulation(&result, 0); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + ASSERT_EQ(result.get_number_of_records(), + sd.get_simulation_parameters().number_of_rays); + for (int i = 0; i < (int)result.get_number_of_records(); ++i) + { + auto rr = result[i]; + ASSERT_GE(rr->get_number_of_interactions(), 2); + Vector3d p0, p1; + rr->get_position(0, p0); + rr->get_position(1, p1); + EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; + EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + EXPECT_LE(sqrt(p1[0]*p1[0] + p1[1]*p1[1]), R); + } +} From fa3b64c798effed0da659aa9fe7ffa59ed9a26bf Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Fri, 27 Mar 2026 16:19:10 -0600 Subject: [PATCH 02/60] Add intersection function to pipeline manager map --- .../optix_runner/OptixCSP/src/core/pipeline_manager.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp index 6d005191..0d0af819 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp @@ -42,7 +42,8 @@ const std::map IntersectionKernelMap = { {OpticalEntityType::RECTANGLE_FLAT, "__intersection__rectangle_flat"}, {OpticalEntityType::TRIANGLE_FLAT, "__intersection__triangle_flat"}, {OpticalEntityType::CYLINDRICAL, "__intersection__cylinder_y"}, - {OpticalEntityType::QUADRILATERAL_FLAT, "__intersection__quadrilateral_flat"}}; + {OpticalEntityType::QUADRILATERAL_FLAT, "__intersection__quadrilateral_flat"}, + {OpticalEntityType::CIRCLE_FLAT, "__intersection__circle_flat"}}; pipelineManager::pipelineManager(SoltraceState &state) : m_state(state) {} From 1aab1e1b0ee673576ae05c2c0da97369d30f40ad Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Mon, 30 Mar 2026 09:20:29 -0600 
Subject: [PATCH 03/60] Compile fixes --- .../optix_runner/OptixCSP/src/core/CspElement.cpp | 4 ++-- .../optix_runner/OptixCSP/src/shaders/intersection.cu | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp index 82cc29e9..d122b949 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp @@ -324,10 +324,10 @@ GeometryDataST CspElement::toDeviceGeometryData() const if (aperture_type == ApertureType::CIRCLE) { - float r = circ->get_radius(); + ApertureCircle circ = static_cast(*m_aperture); + float r = circ.get_radius(); float3 o = OptixCSP::toFloat3(m_origin); float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin)); - ApertureCircle circ = static_cast(*m_aperture); GeometryDataST::Circle_Flat heliostat(o, n, r); geometry_data.setCircle_Flat(heliostat); } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu index 2e39d26d..720acc55 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu @@ -11,7 +11,8 @@ extern "C" extern "C" __device__ __inline__ float ray_distance_to_plane(float3 ro, float3 rd, float4 plane) { - const float3 n = make_float3(plane) return (plane.w - dot(n, ro)) / dot(rd, n); + const float3 n = make_float3(plane); + return (plane.w - dot(n, ro)) / dot(rd, n); } extern "C" __global__ void __intersection__parallelogram() @@ -29,7 +30,8 @@ extern "C" __global__ void __intersection__parallelogram() // float dt = dot( ray_dir, n ); // // Compute distance t (point of intersection) along ray direction from ray origin // float t = ( 
parallelogram.plane.w - dot( n, ray_orig ) ) / dt; - float t = ray_distance_to_plane(ray_orig, ray_dir, paralllelogram.plane); + float t = ray_distance_to_plane(ray_orig, ray_dir, parallelogram.plane); + const float4 n = parallelogram.plane; // Verify intersection distance and Report ray intersection point if (t > ray_tmin && t < ray_tmax) @@ -69,6 +71,7 @@ extern "C" __global__ void __intersection__rectangle_flat() // // Compute distance t (point of intersection) along ray direction from ray origin // float t = (rectangle.plane.w - dot(n, ray_orig)) / dt; float t = ray_distance_to_plane(ray_orig, ray_dir, rectangle.plane); + const float4 n = rectangle.plane; // Verify intersection distance if (t > ray_tmin && t < ray_tmax) @@ -609,6 +612,7 @@ extern "C" __global__ void __intersection__circle_flat() // // Compute distance t (point of intersection) along ray direction from ray origin // float t = ( circ.plane.w - dot( n, ray_orig ) ) / dt; float t = ray_distance_to_plane(ray_orig, ray_dir, circ.plane); + const float4 n = circ.plane; // Verify intersection distance and Report ray intersection point if (t > ray_tmin && t < ray_tmax) From 575810843fec53497c33b371785d787152cd41e4 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Mon, 30 Mar 2026 14:39:52 -0600 Subject: [PATCH 04/60] Initial run at implementation of flat hexagon for optix runner --- .../OptixCSP/src/core/Aperture.cpp | 20 ++++-- .../optix_runner/OptixCSP/src/core/Aperture.h | 18 +++++- .../OptixCSP/src/core/CspElement.cpp | 27 ++++++-- .../OptixCSP/src/core/geometry_manager.cpp | 17 +++++ .../OptixCSP/src/core/pipeline_manager.cpp | 3 +- .../OptixCSP/src/core/soltrace_type.h | 3 +- .../OptixCSP/src/shaders/GeometryDataST.h | 26 ++++++++ .../OptixCSP/src/shaders/Soltrace.h | 1 + .../OptixCSP/src/shaders/intersection.cu | 63 ++++++++++++++++++- .../optix_runner/optix_runner.cpp | 16 ++--- 10 files changed, 171 insertions(+), 23 deletions(-) diff --git 
a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp index a9ae5315..ebf3e3c9 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp @@ -28,12 +28,24 @@ double ApertureCircle::get_radius() const { return radius; } -double ApertureCircle::get_width() const { - return 2.0 * radius; +// double ApertureCircle::get_width() const { +// return 2.0 * radius; +// } + +// double ApertureCircle::get_height() const { +// return 2.0 * radius; +// } + +// ApertureHexagon implementations +ApertureHexagon::ApertureHexagon() : side_length(1.0) {} +ApertureHexagon::ApertureHexagon(double s) : side_length(s) {} + +ApertureType ApertureHexagon::get_aperture_type() const { + return ApertureType::HEXAGON; } -double ApertureCircle::get_height() const { - return 2.0 * radius; +double ApertureHexagon::get_side_length() const { + return side_length; } // ApertureRectangleEasy implementations diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h index 0ef3740f..a2bf4385 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h @@ -41,13 +41,27 @@ namespace OptixCSP { virtual ApertureType get_aperture_type() const override; void set_size(double r); virtual double get_radius() const override; - virtual double get_width() const override; - virtual double get_height() const override; + // virtual double get_width() const override; + // virtual double get_height() const override; private: double radius; }; + // Concrete class for a circular aperture. 
+ class ApertureHexagon : public Aperture { + public: + ApertureHexagon(); + ApertureHexagon(double s); + virtual ~ApertureHexagon() = default; + + virtual ApertureType get_aperture_type() const override; + virtual double get_side_length() const; + + private: + double side_length; + }; + // Concrete class for an easy rectangular aperture. class ApertureRectangle : public Aperture { public: diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp index d122b949..b71de1f6 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp @@ -266,13 +266,10 @@ GeometryDataST CspElement::toDeviceGeometryData() const float3 center = OptixCSP::toFloat3(m_origin); Matrix33d rotation_matrix = get_rotation_matrix(); // L2G rotation matrix - float3 base_x = OptixCSP::toFloat3(rotation_matrix.get_x_basis()); - float3 base_z = OptixCSP::toFloat3(rotation_matrix.get_z_basis()); GeometryDataST::Cylinder_Y heliostat(center, radius, half_height, base_x, base_z); - geometry_data.setCylinder_Y(heliostat); } } @@ -293,7 +290,9 @@ GeometryDataST CspElement::toDeviceGeometryData() const Vec3d v2_global = rotation_matrix * v2 + m_origin; Vec3d v3_global = rotation_matrix * v3 + m_origin; - GeometryDataST::Triangle_Flat heliostat(OptixCSP::toFloat3(v1_global), OptixCSP::toFloat3(v2_global), OptixCSP::toFloat3(v3_global)); + GeometryDataST::Triangle_Flat heliostat(OptixCSP::toFloat3(v1_global), + OptixCSP::toFloat3(v2_global), + OptixCSP::toFloat3(v3_global)); geometry_data.setTriangle_Flat(heliostat); } @@ -328,8 +327,24 @@ GeometryDataST CspElement::toDeviceGeometryData() const float r = circ.get_radius(); float3 o = OptixCSP::toFloat3(m_origin); float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin)); - GeometryDataST::Circle_Flat heliostat(o, n, r); - 
geometry_data.setCircle_Flat(heliostat); + if (surface_type == SurfaceType::FLAT) + { + GeometryDataST::Circle_Flat heliostat(o, n, r); + geometry_data.setCircle_Flat(heliostat); + } + } + + if (aperture_type == ApertureType::HEXAGON) + { + ApertureHexagon hex = static_cast(*m_aperture); + float s = hex.get_side_length(); + float3 o = OptixCSP::toFloat3(m_origin); + float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin)); + if (surface_type == SurfaceType::FLAT) + { + GeometryDataST::Hexagon_Flat hex(o, n, s); + geometry_data.setHexagon_Flat(hex); + } } geometry_data.id = this->m_id; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp index 648dbd74..bd2e51c0 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp @@ -58,6 +58,23 @@ void GeometryManager::collect_geometry_info(const std::vectorget_aperture_type() == ApertureType::HEXAGON) + { + if (element->get_surface_type() == SurfaceType::FLAT) + { + sbt_offset = static_cast(OpticalEntityType::HEXAGON_FLAT); + } + else + { + std::stringstream ss; + ss << "Unimplemented surface type (" + << static_cast(element->get_surface_type()) + << ") for hexagon aperture (" + << static_cast(element->get_aperture_type()); + throw std::runtime_error(ss.str()); + } + } + if (element->get_aperture_type() == ApertureType::RECTANGLE) { if (element->get_surface_type() == SurfaceType::PARABOLIC) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp index 0d0af819..571d706b 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp @@ -43,7 +43,8 
@@ const std::map IntersectionKernelMap = { {OpticalEntityType::TRIANGLE_FLAT, "__intersection__triangle_flat"}, {OpticalEntityType::CYLINDRICAL, "__intersection__cylinder_y"}, {OpticalEntityType::QUADRILATERAL_FLAT, "__intersection__quadrilateral_flat"}, - {OpticalEntityType::CIRCLE_FLAT, "__intersection__circle_flat"}}; + {OpticalEntityType::CIRCLE_FLAT, "__intersection__circle_flat"}, + {OpticalEntityType::HEXAGON_FLAT, "__intersection__hexagon_flat"}}; pipelineManager::pipelineManager(SoltraceState &state) : m_state(state) {} diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h index e10ad3cc..b133146e 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h @@ -10,7 +10,8 @@ namespace OptixCSP RECTANGLE, CIRCLE, TRIANGLE, - QUADRILATERAL + QUADRILATERAL, + HEXAGON }; // types for both scene building and pipeline assembly diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h index 57d28f4d..21c81b86 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h @@ -151,6 +151,18 @@ namespace OptixCSP { float r; }; + struct Hexagon_Flat{ + Hexagon_Flat() = default; + Hexagon_Flat(const float3 &origin, const float3 &normal, const float &side_length) + : s(side_length), center(origin) + { + plane = make_float4(normalize(normal), dot(center, normal)); + } + float4 plane; + float3 center; + float s; + }; + GeometryDataST() = default; void setParallelogram(const Parallelogram& p) @@ -244,6 +256,19 @@ namespace OptixCSP { return circle_flat; } + void setHexagon_Flat(const Hexagon_Flat &h) + { + assert(type == 
UNKNOWN_TYPE); + type = HEXAGON_FLAT; + hexagon_flat = h; + } + + __host__ __device__ const Hexagon_Flat& getHexagon_Flat() const + { + assert(type == HEXAGON_FLAT); + return hexagon_flat; + } + Type type = UNKNOWN_TYPE; int32_t id = OptixCSP::kElementIdUnassigned; @@ -258,6 +283,7 @@ namespace OptixCSP { Triangle_Flat triangle_flat; Quadrilateral_Flat quadrilateral_flat; Circle_Flat circle_flat; + Hexagon_Flat hexagon_flat; }; }; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index cc4af199..bd334452 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -32,6 +32,7 @@ namespace OptixCSP{ TRIANGLE_FLAT = 3, QUADRILATERAL_FLAT = 4, CIRCLE_FLAT = 5, + HEXAGON_FLAT = 6, NUM_OPTICAL_ENTITY_TYPES }; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu index 720acc55..7ad879e1 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu @@ -11,8 +11,8 @@ extern "C" extern "C" __device__ __inline__ float ray_distance_to_plane(float3 ro, float3 rd, float4 plane) { - const float3 n = make_float3(plane); - return (plane.w - dot(n, ro)) / dot(rd, n); + const float3 n = make_float3(plane); + return (plane.w - dot(n, ro)) / dot(rd, n); } extern "C" __global__ void __intersection__parallelogram() @@ -629,3 +629,62 @@ extern "C" __global__ void __intersection__circle_flat() } } } + +extern "C" __global__ void __intersection__hexagon_flat() +{ + const OptixCSP::GeometryDataST::Hexagon_Flat &hex = params.geometry_data_array[optixGetPrimitiveIndex()].getHexagon_Flat(); + + // Get ray information: origin, direction, and min/max 
distances over which ray should be tested + const float3 ray_orig = optixGetWorldRayOrigin(); + const float3 ray_dir = optixGetWorldRayDirection(); + const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax(); + + float t = ray_distance_to_plane(ray_orig, ray_dir, hex.plane); + const float4 n = hex.plane; + + // Verify intersection distance and Report ray intersection point + if (t > ray_tmin && t < ray_tmax) + { + bool is_in = false; + float3 p = ray_orig + ray_dir * t - hex.center; + // float d = length(p - circ.center); + float s = hex.s float xl = 0.5 * s; + float yl = 0.5f * sqrtf(3.0f) * s; + if (-xl <= p.x && p.x <= xl && -yl <= p.y && p.y <= yl) + { + // Center + is_in = true; + } + else if (-s <= p.x && p.x < xl) + { + // Left side + float y1 = 2.0f * yl * (p.x - s); + float y2 = -y1; + if (y1 <= p.y && p.y <= y2) + { + is_in = true; + } + } + else if (xl < p.x && p.x <= s) + { + // Right side + y1 = sqrt(3.0) * (x + ro); + y2 = -y1; + float y1 = 2.0f * yl * (p.x + s); + float y2 = -y1; + if (y2 <= p.y && p.y <= y1) + { + is_in = true; + } + } + + if (is_in) + { + optixReportIntersection(t, + 0, + __float_as_uint(n.x), + __float_as_uint(n.y), + __float_as_uint(n.z)); + } + } +} diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index be1ef9a9..cdef7d62 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -220,7 +220,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) switch (soltrace_aperture_type) { - case ApertureType::RECTANGLE: { @@ -231,7 +230,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) optix_el->set_aperture(aperture); break; } - case ApertureType::CIRCLE: { auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); @@ -240,7 +238,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) 
optix_el->set_aperture(aperture); break; } - case ApertureType::EQUILATERAL_TRIANGLE: { auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); @@ -256,7 +253,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) break; } - case ApertureType::IRREGULAR_TRIANGLE: { auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); @@ -271,7 +267,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) break; } - case ApertureType::IRREGULAR_QUADRILATERAL: { auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); @@ -287,11 +282,18 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) break; } + case ApertureType::HEXAGON: + { + auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); + assert(el_aperture != nullptr); + auto aperture = std::make_shared(el_aperture->radius_circumscribed_circle()); + optix_el->set_aperture(aperture); + } default: // std::cerr << "Unsupported aperture type in OptixCSP" << std::endl; - throw std::runtime_error("Unsupported aperture type in OptixRunner"); - break; + throw std::runtime_error("Unsupported aperture type in OptixRunner"); + break; } optix_el->update_euler_angles(); From 4fe91f9eb487651ecd6b6f31863ec5d9c079f09c Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Mon, 30 Mar 2026 14:49:36 -0600 Subject: [PATCH 05/60] Initial run at basic test for flat hexagon intersection in optix runner --- .../simulation_data/aperture_test.cpp | 2 +- .../geometry_intersection_test.cpp | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/google-tests/unit-tests/simulation_data/aperture_test.cpp b/google-tests/unit-tests/simulation_data/aperture_test.cpp index 3ee7544e..ddf0f642 100644 --- a/google-tests/unit-tests/simulation_data/aperture_test.cpp +++ b/google-tests/unit-tests/simulation_data/aperture_test.cpp @@ -334,7 +334,7 @@ TEST(Aperture, Hexagon) const double TOL = 1e-12; const double D = 2.0; const double R = 0.5 * D; - 
const double S = sqrt(3.0) * R; // Side length of hexagon + // const double S = sqrt(3.0) * R; // Side length of hexagon const double AREA = 0.5 * sqrt(27.0) * R * R; const double X1 = 1.0; diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp index 0799cc38..b8ec68d9 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp @@ -304,3 +304,39 @@ TEST(OptixRunner, FlatCircle) EXPECT_LE(sqrt(p1[0]*p1[0] + p1[1]*p1[1]), R); } } + +TEST(OptixRunner, FlatHexagon) +{ + const double S = 5.0; + auto surf = make_surface(); + auto aper = make_aperture(2*S); + + SimulationData sd; + set_default_sd(sd, surf, aper); + SimulationResult result; + + OptixRunner runner; + RunnerStatus sts = runner.initialize(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.setup_simulation(&sd); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.run_simulation(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.report_simulation(&result, 0); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + ASSERT_EQ(result.get_number_of_records(), + sd.get_simulation_parameters().number_of_rays); + for (int i = 0; i < (int)result.get_number_of_records(); ++i) + { + auto rr = result[i]; + ASSERT_GE(rr->get_number_of_interactions(), 2); + Vector3d p0, p1; + rr->get_position(0, p0); + rr->get_position(1, p1); + EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; + EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } +} From a390dcd046223c6df742c2e704baa38a1adaf625 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Mon, 30 Mar 2026 16:04:04 -0600 Subject: [PATCH 06/60] Fixes for optix runner hexagon flat --- 
.../optix_runner/OptixCSP/src/shaders/GeometryDataST.h | 3 ++- .../optix_runner/OptixCSP/src/shaders/intersection.cu | 9 ++++----- .../simulation_runner/optix_runner/optix_runner.cpp | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h index 21c81b86..100f474b 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h @@ -22,7 +22,8 @@ namespace OptixCSP { RECTANGLE_FLAT = 4, TRIANGLE_FLAT = 5, QUADRILATERAL_FLAT = 6, - CIRCLE_FLAT = 7 + CIRCLE_FLAT = 7, + HEXAGON_FLAT = 8 }; struct Parallelogram diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu index 7ad879e1..ac0dddcc 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu @@ -648,7 +648,8 @@ extern "C" __global__ void __intersection__hexagon_flat() bool is_in = false; float3 p = ray_orig + ray_dir * t - hex.center; // float d = length(p - circ.center); - float s = hex.s float xl = 0.5 * s; + float s = hex.s; + float xl = 0.5f * s; float yl = 0.5f * sqrtf(3.0f) * s; if (-xl <= p.x && p.x <= xl && -yl <= p.y && p.y <= yl) { @@ -658,7 +659,7 @@ extern "C" __global__ void __intersection__hexagon_flat() else if (-s <= p.x && p.x < xl) { // Left side - float y1 = 2.0f * yl * (p.x - s); + float y1 = sqrtf(3.0f) * (p.x + s); float y2 = -y1; if (y1 <= p.y && p.y <= y2) { @@ -668,9 +669,7 @@ extern "C" __global__ void __intersection__hexagon_flat() else if (xl < p.x && p.x <= s) { // Right side - y1 = sqrt(3.0) * (x + ro); - y2 = -y1; - float y1 = 2.0f * yl * (p.x + s); + float y1 = sqrtf(3.0f) * 
(p.x - s); float y2 = -y1; if (y2 <= p.y && p.y <= y1) { diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index cdef7d62..5fe41160 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -286,9 +286,9 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) { auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); assert(el_aperture != nullptr); - auto aperture = std::make_shared(el_aperture->radius_circumscribed_circle()); optix_el->set_aperture(aperture); + break; } default: // std::cerr << "Unsupported aperture type in OptixCSP" << std::endl; From bbda4611e0994b732ed501b318c79d06c197911f Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 31 Mar 2026 14:14:50 -0600 Subject: [PATCH 07/60] Initial implementation of flat annulus for optix runner --- .../OptixCSP/src/core/Aperture.cpp | 8 ++ .../optix_runner/OptixCSP/src/core/Aperture.h | 13 ++ .../OptixCSP/src/core/CspElement.cpp | 14 ++ .../OptixCSP/src/core/geometry_manager.cpp | 17 +++ .../OptixCSP/src/core/pipeline_manager.cpp | 3 +- .../OptixCSP/src/core/soltrace_type.h | 3 +- .../OptixCSP/src/shaders/GeometryDataST.h | 127 +++++++++++------- .../OptixCSP/src/shaders/Soltrace.h | 1 + .../OptixCSP/src/shaders/intersection.cu | 36 ++++- .../optix_runner/optix_runner.cpp | 15 ++- 10 files changed, 177 insertions(+), 60 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp index ebf3e3c9..e0a51e3f 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp @@ -85,3 +85,11 @@ Vec3d ApertureQuadrilateral::get_p0() const { return m_p0; } Vec3d ApertureQuadrilateral::get_p1() const { return m_p1; } Vec3d 
ApertureQuadrilateral::get_p2() const { return m_p2; } Vec3d ApertureQuadrilateral::get_p3() const { return m_p3; } + +ApertureAnnulus::ApertureAnnulus() : ri(0.5), ro(1.0) {} +ApertureAnnulus::ApertureAnnulus(double r_inner, double r_outer) {} +ApertureType ApertureAnnulus::get_aperture_type() const { + return ApertureType::ANNULUS; +} +double ApertureAnnulus::get_radius_inner() const { return ri; } +double ApertureAnnulus::get_radius_outer() const { return ro; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h index a2bf4385..47c52bab 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h @@ -123,4 +123,17 @@ namespace OptixCSP { Vec3d m_p3; }; + class ApertureAnnulus : public Aperture { + public: + ApertureAnnulus(); + ApertureAnnulus(double ri, double ro); + virtual ~ApertureAnnulus() = default; + virtual ApertureType get_aperture_type() const override; + double get_radius_inner() const; + double get_radius_outer() const; + private: + double ri; + double ro; + }; + } \ No newline at end of file diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp index b71de1f6..c5b98270 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp @@ -347,6 +347,20 @@ GeometryDataST CspElement::toDeviceGeometryData() const } } + if (aperture_type == ApertureType::ANNULUS) + { + ApertureAnnulus anf = static_cast(*m_aperture); + float radius_in = anf.get_radius_inner(); + float radius_out = anf.get_radius_outer(); + float3 o = OptixCSP::toFloat3(m_origin); + float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin)); + if (surface_type == 
SurfaceFlat::FLAT) + { + GeometryDataST::Annulus_Flat anf(o, n, radius_in, radius_out); + geometry_data.setAnnulus_Flat(anf); + } + } + geometry_data.id = this->m_id; return geometry_data; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp index bd2e51c0..70de9a55 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp @@ -41,6 +41,23 @@ void GeometryManager::collect_geometry_info(const std::vectorget_aperture_type() == ApertureType::ANNULUS) + { + if (element->get_surface_type() == SurfaceType::FLAT) + { + sbt_offset = static_cast(OpticalEntityType::ANNULUS_FLAT); + } + else + { + std::stringstream ss; + ss << "Unimplemented surface type (" + << static_cast(element->get_surface_type()) + << ") for annular aperture (" + << static_cast(element->get_aperture_type()); + throw std::runtime_error(ss.str()); + } + } + if (element->get_aperture_type() == ApertureType::CIRCLE) { if (element->get_surface_type() == SurfaceType::FLAT) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp index 571d706b..62df1ae6 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp @@ -44,7 +44,8 @@ const std::map IntersectionKernelMap = { {OpticalEntityType::CYLINDRICAL, "__intersection__cylinder_y"}, {OpticalEntityType::QUADRILATERAL_FLAT, "__intersection__quadrilateral_flat"}, {OpticalEntityType::CIRCLE_FLAT, "__intersection__circle_flat"}, - {OpticalEntityType::HEXAGON_FLAT, "__intersection__hexagon_flat"}}; + {OpticalEntityType::HEXAGON_FLAT, "__intersection__hexagon_flat"}, + 
{OpticalEntityType::ANNULUS_FLAT, "__intersection__annulus_flat"}}; pipelineManager::pipelineManager(SoltraceState &state) : m_state(state) {} diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h index b133146e..d98569a8 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_type.h @@ -11,7 +11,8 @@ namespace OptixCSP CIRCLE, TRIANGLE, QUADRILATERAL, - HEXAGON + HEXAGON, + ANNULUS }; // types for both scene building and pipeline assembly diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h index 100f474b..14c83bf3 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h @@ -8,10 +8,11 @@ #define assert(x) /*nop*/ #endif -// TODO: get rid of ST suffix, no clue what it was for ... +// TODO: get rid of ST suffix, no clue what it was for ... 
-namespace OptixCSP { - struct GeometryDataST +namespace OptixCSP +{ + struct GeometryDataST { enum Type { @@ -20,22 +21,20 @@ namespace OptixCSP { RECTANGLE_PARABOLIC = 2, UNKNOWN_TYPE = 3, RECTANGLE_FLAT = 4, - TRIANGLE_FLAT = 5, + TRIANGLE_FLAT = 5, QUADRILATERAL_FLAT = 6, CIRCLE_FLAT = 7, - HEXAGON_FLAT = 8 + HEXAGON_FLAT = 8 }; struct Parallelogram { Parallelogram() = default; Parallelogram(float3 v1, float3 v2, float3 anchor) - : v1(v1) - , v2(v2) - , anchor(anchor) + : v1(v1), v2(v2), anchor(anchor) { float3 normal = normalize(cross(v1, v2)); - float d = dot(normal, anchor); + float d = dot(normal, anchor); this->v1 *= 1.0f / dot(v1, v1); this->v2 *= 1.0f / dot(v2, v2); plane = make_float4(normal, d); @@ -54,7 +53,7 @@ namespace OptixCSP { : center(center), x(x), y(y), width(width), height(height) { float3 normal = normalize(cross(x, y)); - float d = dot(normal, center); + float d = dot(normal, center); plane = make_float4(normal, d); } @@ -66,34 +65,28 @@ namespace OptixCSP { float height; }; - struct Cylinder_Y { + struct Cylinder_Y + { Cylinder_Y() = default; Cylinder_Y(float3 center, float radius, float half_height, float3 base_x, float3 base_z) - : center(center) - , radius(radius) - , half_height(half_height) - , base_x(base_x) - , base_z(base_z) { + : center(center), radius(radius), half_height(half_height), base_x(base_x), base_z(base_z) + { assert(dot(base_x, base_z) < 1e-3f); } - float3 center; float radius; float half_height; - float3 base_x; // x axis of the cylinder - float3 base_z; // z axis of the cylinder + float3 base_x; // x axis of the cylinder + float3 base_z; // z axis of the cylinder }; - struct Rectangle_Parabolic { + struct Rectangle_Parabolic + { Rectangle_Parabolic() = default; Rectangle_Parabolic(float3 v1, float3 v2, float3 anchor, float curv_x, float curv_y) - : v1(v1) - , v2(v2) - , anchor(anchor) - , curv_x(curv_x) - , curv_y(curv_y) + : v1(v1), v2(v2), anchor(anchor), curv_x(curv_x), curv_y(curv_y) { float3 normal = 
normalize(cross(v1, v2)); float d = dot(normal, anchor); @@ -106,14 +99,15 @@ namespace OptixCSP { float3 v1; float3 v2; float3 anchor; - //float3 focus; + // float3 focus; float curv_x; float curv_y; }; - struct Triangle_Flat { + struct Triangle_Flat + { Triangle_Flat() = default; - Triangle_Flat(const float3& a, const float3& b, const float3& c) + Triangle_Flat(const float3 &a, const float3 &b, const float3 &c) : v0(a), e1(b - a), e2(c - a) { normal = normalize(cross(e1, e2)); @@ -122,10 +116,11 @@ namespace OptixCSP { float3 v0; // base vertex float3 e1, e2; // edges float3 normal; - float d; // plane distance + float d; // plane distance }; - struct Quadrilateral_Flat{ + struct Quadrilateral_Flat + { Quadrilateral_Flat() = default; Quadrilateral_Flat(const float3 &a, const float3 &b, const float3 &c, const float3 &d) @@ -136,14 +131,15 @@ namespace OptixCSP { normal = normalize(cross(e1, e2)); } float3 p0, p1, p2, p3; // Vertices in counterclockwise order - float3 normal; // Positive direction follows right-hand rule + float3 normal; // Positive direction follows right-hand rule }; - struct Circle_Flat{ + struct Circle_Flat + { Circle_Flat() = default; // Circle_Flat(const float radius) : r(radius) {} Circle_Flat(const float3 &origin, const float3 &normal, const float &radius) - : r(radius), center(origin) + : r(radius), center(origin) { plane = make_float4(normalize(normal), dot(center, normal)); } @@ -152,10 +148,11 @@ namespace OptixCSP { float r; }; - struct Hexagon_Flat{ + struct Hexagon_Flat + { Hexagon_Flat() = default; Hexagon_Flat(const float3 &origin, const float3 &normal, const float &side_length) - : s(side_length), center(origin) + : s(side_length), center(origin) { plane = make_float4(normalize(normal), dot(center, normal)); } @@ -164,72 +161,86 @@ namespace OptixCSP { float s; }; + struct Annulus_Flat + { + Annulus_Flat() = default; + Annulus_Flat(const float3 &origin, const float3 &normal, + const float &r_inner, const float &r_outer) + { + 
plane = make_float4(normalize(normal), dot(center, normal)); + } + float4 plane; + float3 center; + float ri; + float ro; + }; + GeometryDataST() = default; - void setParallelogram(const Parallelogram& p) + void setParallelogram(const Parallelogram &p) { assert(type == UNKNOWN_TYPE); type = PARALLELOGRAM; parallelogram = p; } - __host__ __device__ const Parallelogram& getParallelogram() const + __host__ __device__ const Parallelogram &getParallelogram() const { assert(type == PARALLELOGRAM); return parallelogram; } - void setRectangle_Flat(const Rectangle_Flat& r) + void setRectangle_Flat(const Rectangle_Flat &r) { assert(type == UNKNOWN_TYPE); type = RECTANGLE_FLAT; rectangle_flat = r; } - __host__ __device__ const Rectangle_Flat& getRectangle_Flat() const + __host__ __device__ const Rectangle_Flat &getRectangle_Flat() const { assert(type == RECTANGLE_FLAT); return rectangle_flat; } - void setCylinder_Y(const Cylinder_Y& c) + void setCylinder_Y(const Cylinder_Y &c) { assert(type == UNKNOWN_TYPE); type = CYLINDER_Y; cylinder_y = c; } - __host__ __device__ const Cylinder_Y& getCylinder_Y() const + __host__ __device__ const Cylinder_Y &getCylinder_Y() const { assert(type == CYLINDER_Y); return cylinder_y; } - void setRectangleParabolic(const Rectangle_Parabolic& r) + void setRectangleParabolic(const Rectangle_Parabolic &r) { assert(type == UNKNOWN_TYPE); type = RECTANGLE_PARABOLIC; rectangle_parabolic = r; } - __host__ __device__ const Rectangle_Parabolic& getRectangleParabolic() const + __host__ __device__ const Rectangle_Parabolic &getRectangleParabolic() const { assert(type == RECTANGLE_PARABOLIC); return rectangle_parabolic; } - void setTriangle_Flat(const Triangle_Flat& t) + void setTriangle_Flat(const Triangle_Flat &t) { assert(type == UNKNOWN_TYPE); type = TRIANGLE_FLAT; triangle_flat = t; - } + } - __host__ __device__ const Triangle_Flat& getTriangle_Flat() const + __host__ __device__ const Triangle_Flat &getTriangle_Flat() const { assert(type == 
TRIANGLE_FLAT); return triangle_flat; - } + } void setQuadrilateral_Flat(const Quadrilateral_Flat &q) { @@ -238,7 +249,7 @@ namespace OptixCSP { quadrilateral_flat = q; } - __host__ __device__ const Quadrilateral_Flat& getQuadrilateral_Flat() const + __host__ __device__ const Quadrilateral_Flat &getQuadrilateral_Flat() const { assert(type == QUADRILATERAL_FLAT); return quadrilateral_flat; @@ -251,7 +262,7 @@ namespace OptixCSP { circle_flat = c; } - __host__ __device__ const Circle_Flat& getCircle_Flat() const + __host__ __device__ const Circle_Flat &getCircle_Flat() const { assert(type == CIRCLE_FLAT); return circle_flat; @@ -264,12 +275,25 @@ namespace OptixCSP { hexagon_flat = h; } - __host__ __device__ const Hexagon_Flat& getHexagon_Flat() const + __host__ __device__ const Hexagon_Flat &getHexagon_Flat() const { assert(type == HEXAGON_FLAT); return hexagon_flat; } + void setAnnulus_Flat(const Annulus_Flat &anf) + { + assert(type == ANNULUS_FLAT); + type = ANNULUS_FLAT; + annulus_flat = anf; + } + + __host__ __device__ const Annulus_Flat &getAnnulus_Flat() const + { + assert(type == ANNULUS_FLAT); + return annulus_flat; + } + Type type = UNKNOWN_TYPE; int32_t id = OptixCSP::kElementIdUnassigned; @@ -281,10 +305,11 @@ namespace OptixCSP { Cylinder_Y cylinder_y; Rectangle_Parabolic rectangle_parabolic; Rectangle_Flat rectangle_flat; - Triangle_Flat triangle_flat; + Triangle_Flat triangle_flat; Quadrilateral_Flat quadrilateral_flat; Circle_Flat circle_flat; Hexagon_Flat hexagon_flat; + Annulus_Flat annulus_flat; }; }; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index bd334452..7a393a29 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -33,6 +33,7 @@ namespace OptixCSP{ QUADRILATERAL_FLAT = 4, CIRCLE_FLAT = 5, HEXAGON_FLAT = 6, + 
ANNULUS_FLAT = 7, NUM_OPTICAL_ENTITY_TYPES }; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu index ac0dddcc..8ff90c60 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu @@ -649,7 +649,7 @@ extern "C" __global__ void __intersection__hexagon_flat() float3 p = ray_orig + ray_dir * t - hex.center; // float d = length(p - circ.center); float s = hex.s; - float xl = 0.5f * s; + float xl = 0.5f * s; float yl = 0.5f * sqrtf(3.0f) * s; if (-xl <= p.x && p.x <= xl && -yl <= p.y && p.y <= yl) { @@ -659,7 +659,7 @@ extern "C" __global__ void __intersection__hexagon_flat() else if (-s <= p.x && p.x < xl) { // Left side - float y1 = sqrtf(3.0f) * (p.x + s); + float y1 = sqrtf(3.0f) * (p.x + s); float y2 = -y1; if (y1 <= p.y && p.y <= y2) { @@ -669,14 +669,14 @@ extern "C" __global__ void __intersection__hexagon_flat() else if (xl < p.x && p.x <= s) { // Right side - float y1 = sqrtf(3.0f) * (p.x - s); + float y1 = sqrtf(3.0f) * (p.x - s); float y2 = -y1; if (y2 <= p.y && p.y <= y1) { is_in = true; } } - + if (is_in) { optixReportIntersection(t, @@ -687,3 +687,31 @@ extern "C" __global__ void __intersection__hexagon_flat() } } } + +extern "C" __global__ void __intersection__annulus_flat() +{ + const OptixCSP::GeometryDataST::Hexagon_Flat &anf = params.geometry_data_array[optixGetPrimitiveIndex()].getAnnulus_Flat(); + + // Get ray information: origin, direction, and min/max distances over which ray should be tested + const float3 ray_orig = optixGetWorldRayOrigin(); + const float3 ray_dir = optixGetWorldRayDirection(); + const float ray_tmin = optixGetRayTmin(), ray_tmax = optixGetRayTmax(); + + float t = ray_distance_to_plane(ray_orig, ray_dir, anf.plane); + const float4 n = anf.plane; + + // Verify intersection distance and Report ray 
intersection point + if (t > ray_tmin && t < ray_tmax) + { + float3 p = ray_orig + ray_dir * t; + float d = length(p - circ.center); + if (anf.ri <= d && d <= anf.ro) + { + optixReportIntersection(t, + 0, + __float_as_uint(n.x), + __float_as_uint(n.y), + __float_as_uint(n.z)); + } + } +} diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 5fe41160..20c41fdf 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -222,11 +222,20 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) { case ApertureType::RECTANGLE: { - auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); assert(el_aperture != nullptr); // TODO: account for x and y coord? - auto aperture = std::make_shared(el_aperture->x_length, el_aperture->y_length); + auto aperture = std::make_shared( + el_aperture->x_length, el_aperture->y_length); + optix_el->set_aperture(aperture); + break; + } + case ApertureType::ANNULUS: + { + auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); + assert(el_aperture != nullptr); + auto aperture = std::make_shared( + el_aperture->inner_radius, el_aperture->outer_radius); optix_el->set_aperture(aperture); break; } @@ -288,7 +297,7 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) assert(el_aperture != nullptr); auto aperture = std::make_shared(el_aperture->radius_circumscribed_circle()); optix_el->set_aperture(aperture); - break; + break; } default: // std::cerr << "Unsupported aperture type in OptixCSP" << std::endl; From 1c210072fcc546c967c1d022f987bcbd44e0edf1 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 31 Mar 2026 14:18:33 -0600 Subject: [PATCH 08/60] Initial implementation of test for optix runner annulus flat --- .../geometry_intersection_test.cpp | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git 
a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp index b8ec68d9..b5f77b4d 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp @@ -340,3 +340,42 @@ TEST(OptixRunner, FlatHexagon) EXPECT_TRUE(aper->is_in(p1[0], p1[1])); } } + +TEST(OptixRunner, FlatAnnulus) +{ + const double R0 = 5.0; + const double R1 = 10.0; + auto surf = make_surface(); + auto aper = make_aperture(R0, R1); + + SimulationData sd; + set_default_sd(sd, surf, aper); + SimulationResult result; + + OptixRunner runner; + RunnerStatus sts = runner.initialize(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.setup_simulation(&sd); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.run_simulation(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.report_simulation(&result, 0); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + ASSERT_EQ(result.get_number_of_records(), + sd.get_simulation_parameters().number_of_rays); + for (int i = 0; i < (int)result.get_number_of_records(); ++i) + { + auto rr = result[i]; + ASSERT_GE(rr->get_number_of_interactions(), 2); + Vector3d p0, p1; + rr->get_position(0, p0); + rr->get_position(1, p1); + EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; + EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + double r = sqrt(p1[0]*p1[0] + p1[1]*p1[1]); + EXPECT_GE(r, R0); + EXPECT_LE(r, R1); + } +} From f820fe5b39d7372ae3766ab6a54594ed439276f1 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 31 Mar 2026 16:16:02 -0600 Subject: [PATCH 09/60] Fixes for flat annulus in optix runner --- .../optix_runner/OptixCSP/src/core/Aperture.cpp | 7 +++++-- .../optix_runner/OptixCSP/src/core/Aperture.h | 6 ++++-- .../optix_runner/OptixCSP/src/core/CspElement.cpp | 
5 +++-- .../optix_runner/OptixCSP/src/shaders/GeometryDataST.h | 9 ++++++--- .../optix_runner/OptixCSP/src/shaders/intersection.cu | 10 +++++++--- .../simulation_runner/optix_runner/optix_runner.cpp | 3 +-- .../optix_runner/geometry_intersection_test.cpp | 3 ++- 7 files changed, 28 insertions(+), 15 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp index e0a51e3f..dbe6a80b 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.cpp @@ -1,4 +1,5 @@ #include "Aperture.h" +#include "constants.hpp" using namespace OptixCSP; @@ -86,10 +87,12 @@ Vec3d ApertureQuadrilateral::get_p1() const { return m_p1; } Vec3d ApertureQuadrilateral::get_p2() const { return m_p2; } Vec3d ApertureQuadrilateral::get_p3() const { return m_p3; } -ApertureAnnulus::ApertureAnnulus() : ri(0.5), ro(1.0) {} -ApertureAnnulus::ApertureAnnulus(double r_inner, double r_outer) {} +ApertureAnnulus::ApertureAnnulus() : ri(0.5), ro(1.0), arc(2.0 * SolTrace::Data::PI) {} +ApertureAnnulus::ApertureAnnulus(double r_inner, double r_outer, double arc) + : ri(r_inner), ro(r_outer), arc(arc) {} ApertureType ApertureAnnulus::get_aperture_type() const { return ApertureType::ANNULUS; } double ApertureAnnulus::get_radius_inner() const { return ri; } double ApertureAnnulus::get_radius_outer() const { return ro; } +double ApertureAnnulus::get_arc() const { return arc; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h index 47c52bab..13168a04 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/Aperture.h @@ -126,14 +126,16 @@ namespace OptixCSP { class ApertureAnnulus : public Aperture { public: 
ApertureAnnulus(); - ApertureAnnulus(double ri, double ro); + ApertureAnnulus(double ri, double ro, double arc); virtual ~ApertureAnnulus() = default; virtual ApertureType get_aperture_type() const override; double get_radius_inner() const; double get_radius_outer() const; + double get_arc() const; private: double ri; double ro; + double arc; // Measured in radians }; -} \ No newline at end of file +} diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp index c5b98270..01ecad94 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp @@ -352,11 +352,12 @@ GeometryDataST CspElement::toDeviceGeometryData() const ApertureAnnulus anf = static_cast(*m_aperture); float radius_in = anf.get_radius_inner(); float radius_out = anf.get_radius_outer(); + float arc = anf.get_arc(); float3 o = OptixCSP::toFloat3(m_origin); float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin)); - if (surface_type == SurfaceFlat::FLAT) + if (surface_type == SurfaceType::FLAT) { - GeometryDataST::Annulus_Flat anf(o, n, radius_in, radius_out); + GeometryDataST::Annulus_Flat anf(o, n, radius_in, radius_out, arc); geometry_data.setAnnulus_Flat(anf); } } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h index 14c83bf3..217e64dd 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/GeometryDataST.h @@ -24,7 +24,8 @@ namespace OptixCSP TRIANGLE_FLAT = 5, QUADRILATERAL_FLAT = 6, CIRCLE_FLAT = 7, - HEXAGON_FLAT = 8 + HEXAGON_FLAT = 8, + ANNULUS_FLAT = 9 }; struct Parallelogram @@ -165,7 +166,8 @@ namespace OptixCSP { Annulus_Flat() = default; 
Annulus_Flat(const float3 &origin, const float3 &normal, - const float &r_inner, const float &r_outer) + const float &r_inner, const float &r_outer, const float &arc) + : center(origin), ri(r_inner), ro(r_outer), arc(arc) { plane = make_float4(normalize(normal), dot(center, normal)); } @@ -173,6 +175,7 @@ namespace OptixCSP float3 center; float ri; float ro; + float arc; // Arc angle in radians with x-axis in the middle }; GeometryDataST() = default; @@ -283,7 +286,7 @@ namespace OptixCSP void setAnnulus_Flat(const Annulus_Flat &anf) { - assert(type == ANNULUS_FLAT); + assert(type == UNKNOWN_TYPE); type = ANNULUS_FLAT; annulus_flat = anf; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu index 8ff90c60..25942217 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu @@ -690,7 +690,7 @@ extern "C" __global__ void __intersection__hexagon_flat() extern "C" __global__ void __intersection__annulus_flat() { - const OptixCSP::GeometryDataST::Hexagon_Flat &anf = params.geometry_data_array[optixGetPrimitiveIndex()].getAnnulus_Flat(); + const OptixCSP::GeometryDataST::Annulus_Flat &anf = params.geometry_data_array[optixGetPrimitiveIndex()].getAnnulus_Flat(); // Get ray information: origin, direction, and min/max distances over which ray should be tested const float3 ray_orig = optixGetWorldRayOrigin(); @@ -703,15 +703,19 @@ extern "C" __global__ void __intersection__annulus_flat() // Verify intersection distance and Report ray intersection point if (t > ray_tmin && t < ray_tmax) { - float3 p = ray_orig + ray_dir * t; - float d = length(p - circ.center); + float3 p = ray_orig + ray_dir * t - anf.center; + float d = length(p); if (anf.ri <= d && d <= anf.ro) { + float theta = atan2f(p.y, p.x); + if (fabsf(theta) <= 0.5f * anf.arc) + { 
optixReportIntersection(t, 0, __float_as_uint(n.x), __float_as_uint(n.y), __float_as_uint(n.z)); + } } } } diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 20c41fdf..90d51d94 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -234,8 +234,7 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) { auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); assert(el_aperture != nullptr); - auto aperture = std::make_shared( - el_aperture->inner_radius, el_aperture->outer_radius); + auto aperture = std::make_shared(el_aperture->inner_radius, el_aperture->outer_radius, el_aperture->arc_angle * D2R); optix_el->set_aperture(aperture); break; } diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp index b5f77b4d..5afed903 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp @@ -345,8 +345,9 @@ TEST(OptixRunner, FlatAnnulus) { const double R0 = 5.0; const double R1 = 10.0; + const double ARC = 2 * PI; auto surf = make_surface(); - auto aper = make_aperture(R0, R1); + auto aper = make_aperture(R0, R1, ARC); SimulationData sd; set_default_sd(sd, surf, aper); From aa722bacf63448d73253adb596a415aa54b51332 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Thu, 16 Apr 2026 08:29:32 -0600 Subject: [PATCH 10/60] Hack fixes for legacy build --- coretrace/CMakeLists.txt | 7 +- coretrace/simdata_bridge.cpp | 128 +++++++++--------- .../simulation_data/simulation_parameters.hpp | 6 +- coretrace/stapi.cpp | 16 +-- 4 files changed, 78 insertions(+), 79 deletions(-) diff --git a/coretrace/CMakeLists.txt 
b/coretrace/CMakeLists.txt index 08087e5b..c26a5245 100644 --- a/coretrace/CMakeLists.txt +++ b/coretrace/CMakeLists.txt @@ -28,7 +28,7 @@ include_directories(. ./simulation_results ./simulation_runner ./simulation_runner/native_runner - ./simulation_runner/optix_runner + # ./simulation_runner/optix_runner ./simulation_runner/embree_runner ) @@ -58,7 +58,6 @@ set(CORETRACE_SRC treemesh.cpp types.cpp vshot.cpp - simdata_bridge.cpp ) @@ -137,7 +136,7 @@ if(SOLTRACE_BUILD_CORETRACE) PRIVATE ${embree_INCLUDE_DIRS} ) - target_link_libraries(coretrace_api PRIVATE simdata native_runner optix_runner simresult embree_runner ${embree_LIBRARIES}) + target_link_libraries(coretrace_api PRIVATE simdata native_runner simresult embree_runner ${embree_LIBRARIES}) if(MSVC) target_compile_definitions(coretrace_api PRIVATE STCORE_API_EXPORTS _STCOREDLL_) @@ -162,7 +161,7 @@ if(SOLTRACE_BUILD_CORETRACE) PRIVATE ${embree_INCLUDE_DIRS} ) - target_link_libraries(coretrace PRIVATE simdata native_runner optix_runner simresult embree_runner ${embree_LIBRARIES}) + target_link_libraries(coretrace PRIVATE simdata native_runner simresult embree_runner ${embree_LIBRARIES}) ##################################################################################################################### # diff --git a/coretrace/simdata_bridge.cpp b/coretrace/simdata_bridge.cpp index 97d91ef7..af126ddb 100644 --- a/coretrace/simdata_bridge.cpp +++ b/coretrace/simdata_bridge.cpp @@ -6,7 +6,7 @@ #include "stage_element.hpp" #include "single_element.hpp" #include "virtual_element.hpp" -#include +// #include #include #include "simdata_io.hpp" #include @@ -63,36 +63,36 @@ void convert_user_sun_data(const std::vector& sun_shape_angle, const std return; } -int assign_raydata_from_hitpoints(const std::vector& hp_vec, const std::vector& raynumber_vec, - TSystem* sys) -{ - // Assign raydata to TSystem (for legacy GUI) - sys->AllRayData.Clear(); - for (TStage* stage : sys->StageList) - 
sys->AllRayData.Merge(stage->RayData); - int i_element = 0; - for (float4 element : hp_vec) - { - double stage_optix = element.x; // This is the DEPTH, NOT the stage in the soltrace sense - double PosRaySurfStage[3] = { element.y, element.z, element.w }; - double CosRaySurfStage[3] = { 0,0,0 }; // Don't have cos reported from optix - int element_number = 1; // Don't get element number from optix - int raynumber = raynumber_vec[i_element]; - - // Only add ray data if it is Not the sun ray ('stage' 0) - if (stage_optix != 0) - { - sys->StageList[0]->RayData.Append(PosRaySurfStage, CosRaySurfStage, element_number, - stage_optix, raynumber); - } - i_element++; - } - - for (TStage* stage : sys->StageList) - sys->AllRayData.Merge(stage->RayData); - - return 0; -} +// int assign_raydata_from_hitpoints(const std::vector& hp_vec, const std::vector& raynumber_vec, +// TSystem* sys) +// { +// // Assign raydata to TSystem (for legacy GUI) +// sys->AllRayData.Clear(); +// for (TStage* stage : sys->StageList) +// sys->AllRayData.Merge(stage->RayData); +// int i_element = 0; +// for (float4 element : hp_vec) +// { +// double stage_optix = element.x; // This is the DEPTH, NOT the stage in the soltrace sense +// double PosRaySurfStage[3] = { element.y, element.z, element.w }; +// double CosRaySurfStage[3] = { 0,0,0 }; // Don't have cos reported from optix +// int element_number = 1; // Don't get element number from optix +// int raynumber = raynumber_vec[i_element]; + +// // Only add ray data if it is Not the sun ray ('stage' 0) +// if (stage_optix != 0) +// { +// sys->StageList[0]->RayData.Append(PosRaySurfStage, CosRaySurfStage, element_number, +// stage_optix, raynumber); +// } +// i_element++; +// } + +// for (TStage* stage : sys->StageList) +// sys->AllRayData.Merge(stage->RayData); + +// return 0; +// } int set_tstage_parameters(TSystem* sys_legacy, const SolTrace::NativeRunner::TSystem& sys_native) { @@ -402,42 +402,42 @@ int run_native_file_runner(TSystem* sys, const char* 
file_name, int nthreads) return 0; } -int run_optix_runner(SolTrace::Data::SimulationData& sd, TSystem* sys) -{ - OptixRunner runner; - SolTrace::Runner::RunnerStatus sts = runner.initialize(); - sts = runner.setup_simulation(&sd); - sts = runner.run_simulation_core(false); +// int run_optix_runner(SolTrace::Data::SimulationData& sd, TSystem* sys) +// { +// OptixRunner runner; +// SolTrace::Runner::RunnerStatus sts = runner.initialize(); +// sts = runner.setup_simulation(&sd); +// sts = runner.run_simulation_core(false); - std::vector hp_vec; - std::vector raynumber_vec; - std::vector element_id_vec; - runner.get_hp_output(hp_vec, raynumber_vec, element_id_vec); +// std::vector hp_vec; +// std::vector raynumber_vec; +// std::vector element_id_vec; +// runner.get_hp_output(hp_vec, raynumber_vec, element_id_vec); - assign_raydata_from_hitpoints(hp_vec, raynumber_vec, sys); +// assign_raydata_from_hitpoints(hp_vec, raynumber_vec, sys); - return 0; -} - -int run_optix_file_runner(TSystem* sys, const char* file_name) -{ - // Directly run OptixCSP using stinput file (debug use ONLY) - OptixCSP::SolTraceSystem sys_optix; - sys_optix.set_number_of_rays(sys->sim_raycount, sys->sim_raymax); - bool ok = sys_optix.read_st_input(file_name); - sys_optix.initialize(); - sys_optix.run(); - - std::vector hp_vec; - std::vector raynumber_vec; - std::vector element_id_vec; - std::vector hit_type_vec; - sys_optix.get_hp_output(hp_vec, raynumber_vec, element_id_vec, hit_type_vec); - - assign_raydata_from_hitpoints(hp_vec, raynumber_vec, sys); - - return 0; -} +// return 0; +// } + +// int run_optix_file_runner(TSystem* sys, const char* file_name) +// { +// // Directly run OptixCSP using stinput file (debug use ONLY) +// OptixCSP::SolTraceSystem sys_optix; +// sys_optix.set_number_of_rays(sys->sim_raycount, sys->sim_raymax); +// bool ok = sys_optix.read_st_input(file_name); +// sys_optix.initialize(); +// sys_optix.run(); + +// std::vector hp_vec; +// std::vector raynumber_vec; +// 
std::vector element_id_vec; +// std::vector hit_type_vec; +// sys_optix.get_hp_output(hp_vec, raynumber_vec, element_id_vec, hit_type_vec); + +// assign_raydata_from_hitpoints(hp_vec, raynumber_vec, sys); + +// return 0; +// } int run_embree_runner(SolTrace::Data::SimulationData& sd, TSystem* sys, const int nthreads) { diff --git a/coretrace/simulation_data/simulation_parameters.hpp b/coretrace/simulation_data/simulation_parameters.hpp index be15daa9..6fb66717 100644 --- a/coretrace/simulation_data/simulation_parameters.hpp +++ b/coretrace/simulation_data/simulation_parameters.hpp @@ -21,9 +21,6 @@ struct SimulationParameters // TODO: Figure out how to store time... DateTime sim_dt; - bool include_sun_shape_errors; - bool include_optical_errors; - std::uint_fast64_t number_of_rays; std::uint_fast64_t max_number_of_rays; double tolerance; @@ -33,6 +30,9 @@ struct SimulationParameters int seed; + bool include_sun_shape_errors; + bool include_optical_errors; + SimulationParameters() : number_of_rays(10000), max_number_of_rays(1000000), tolerance(0.0), diff --git a/coretrace/stapi.cpp b/coretrace/stapi.cpp index 14f3b10a..2ed6d9ae 100644 --- a/coretrace/stapi.cpp +++ b/coretrace/stapi.cpp @@ -787,14 +787,14 @@ STCORE_API int st_sim_run_SolTrace20(st_context_t pcxt, unsigned int seed, const case(ST_RUNNER_NATIVE_FILE): run_native_file_runner(sys, file_name, nthreads); break; - // Optix runner - case(ST_RUNNER_OPTIX): - run_optix_runner(sd, sys); - break; - // Optix direct file load runner - case(ST_RUNNER_OPTIX_FILE): - run_optix_file_runner(sys, file_name); - break; + // // Optix runner + // case(ST_RUNNER_OPTIX): + // run_optix_runner(sd, sys); + // break; + // // Optix direct file load runner + // case(ST_RUNNER_OPTIX_FILE): + // run_optix_file_runner(sys, file_name); + // break; // Embree case(ST_RUNNER_EMBREE): run_embree_runner(sd, sys, nthreads); From 6c0c77e9f9df502ea886932d77767fe8b53f3c6e Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Thu, 16 Apr 2026 
08:30:04 -0600 Subject: [PATCH 11/60] Changes to simdriver for scaling workflow --- coretrace/simdriver/main.cpp | 205 +++++++++++++----- .../embree_runner/trace_embree.cpp | 3 +- .../OptixCSP/src/shaders/intersection.cu | 20 +- .../embree_runner_multithreading_test.cpp | 2 +- 4 files changed, 161 insertions(+), 69 deletions(-) diff --git a/coretrace/simdriver/main.cpp b/coretrace/simdriver/main.cpp index 2cb9fff3..14ff41e4 100644 --- a/coretrace/simdriver/main.cpp +++ b/coretrace/simdriver/main.cpp @@ -2,15 +2,17 @@ * @file main.cpp * @brief Command-line driver for SolTrace ray tracing. * - * Reads a JSON file to configure SimulationData, runs the ray tracer, + * Reads a JSON or .stinput file to configure SimulationData, runs the ray tracer, * and writes the ray interaction records to a CSV file. * * Usage: - * simdriver [options] + * simdriver [] [options] * * Options: * --threads Number of parallel threads (default: 1) * --rays Override the number of rays from the JSON file + * --no-output Skip result retrieval and CSV output (output file not + * required when this flag is set) * --embree Use the Embree runner (only available if built with * SOLTRACE_BUILD_EMBREE_SUPPORT=ON; falls back to native * runner with a warning if Embree support is absent) @@ -47,11 +49,13 @@ using SolTrace::Runner::RunnerStatus; static void print_usage(const char *prog) { std::cerr - << "Usage: " << prog << " [options]\n" + << "Usage: " << prog << " [] [options]\n" << "\n" << "Options:\n" << " --threads Number of threads (default: 1)\n" << " --rays Override number of rays specified in the JSON file\n" + << " --no-output Skip result retrieval and CSV output\n" + << " (output file argument not required with this flag)\n" #ifdef SOLTRACE_EMBREE_SUPPORT << " --embree Use Embree runner instead of the native runner\n" << " (requires SOLTRACE_BUILD_EMBREE_SUPPORT=ON at build time)\n" @@ -65,21 +69,42 @@ static void print_usage(const char *prog) int main(int argc, char *argv[]) { - if (argc < 
3) + if (argc < 2) { print_usage(argv[0]); return EXIT_FAILURE; } + // Pre-scan for --no-output so we know whether output_file is required + bool skip_output = false; + for (int i = 2; i < argc; ++i) + { + if (std::string(argv[i]) == "--no-output") + { + skip_output = true; + break; + } + } + + if (!skip_output && argc < 3) + { + std::cerr << "Error: output file is required unless --no-output is specified\n"; + print_usage(argv[0]); + return EXIT_FAILURE; + } + const std::string input_file = argv[1]; - const std::string output_file = argv[2]; + // output_file is only meaningful when skip_output is false + const std::string output_file = (!skip_output && argc >= 3) ? argv[2] : ""; int num_threads = 1; long long num_rays_override = -1; // -1 means use what the JSON specifies bool use_embree = false; bool use_optix = false; - for (int i = 3; i < argc; ++i) + // Start parsing options from argv[2] if skip_output, else from argv[3] + const int opts_start = skip_output ? 2 : 3; + for (int i = opts_start; i < argc; ++i) { const std::string arg = argv[i]; if (arg == "--threads") @@ -126,6 +151,10 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } } + else if (arg == "--no-output") + { + // already handled in pre-scan; skip here + } else if (arg == "--embree") { use_embree = true; @@ -143,23 +172,55 @@ int main(int argc, char *argv[]) } // ------------------------------------------------------------------------- - // Load simulation data from JSON + // Load simulation data from JSON or .stinput file // ------------------------------------------------------------------------- SimulationData simData; - try { + // Determine format by extension + auto ends_with = [](const std::string &s, const std::string &suffix) + { + return s.size() >= suffix.size() && + s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0; + }; + const bool is_stinput = ends_with(input_file, ".stinput"); + const bool is_json = ends_with(input_file, ".json"); + + if (!is_stinput && 
!is_json) + { + std::cerr << "Error: unrecognised input file extension (expected .json or .stinput): " + << input_file << "\n"; + return EXIT_FAILURE; + } + std::cout << "Loading simulation data from: " << input_file << "...\n"; auto t_load_start = std::chrono::steady_clock::now(); - simData.import_json_file(input_file); + + if (is_json) + { + try + { + simData.import_json_file(input_file); + } + catch (const std::exception &e) + { + std::cerr << "Error loading JSON file: " << e.what() << "\n"; + return EXIT_FAILURE; + } + } + else // .stinput + { + if (!simData.import_from_file(input_file)) + { + std::cerr << "Error loading .stinput file: " << input_file << "\n"; + return EXIT_FAILURE; + } + } + auto t_load_end = std::chrono::steady_clock::now(); std::cout << " Loaded in " << std::chrono::duration(t_load_end - t_load_start).count() - << " s\n"; - } - catch (const std::exception &e) - { - std::cerr << "Error loading JSON file: " << e.what() << "\n"; - return EXIT_FAILURE; + << " s\n" + << " Elements loaded: " << simData.get_number_of_elements() << "\n"; } // Override ray count if the user requested it @@ -218,23 +279,30 @@ int main(int argc, char *argv[]) << std::chrono::duration(t_run_end - t_run_start).count() << " s\n"; - std::cout << "Retrieving results...\n"; - auto t_report_start = std::chrono::steady_clock::now(); - sts = runner.report_simulation(&result, 0); - auto t_report_end = std::chrono::steady_clock::now(); - if (sts != RunnerStatus::SUCCESS) + if (!skip_output) { - std::cerr << "Error: failed to collect simulation results\n"; - return EXIT_FAILURE; + std::cout << "Retrieving results...\n"; + auto t_report_start = std::chrono::steady_clock::now(); + sts = runner.report_simulation(&result, 0); + auto t_report_end = std::chrono::steady_clock::now(); + if (sts != RunnerStatus::SUCCESS) + { + std::cerr << "Error: failed to collect simulation results\n"; + return EXIT_FAILURE; + } + std::cout << " Retrieved in " + << std::chrono::duration(t_report_end - 
t_report_start).count() + << " s\n"; + } + else + { + std::cout << "Skipping result retrieval (--no-output).\n"; } - std::cout << " Retrieved in " - << std::chrono::duration(t_report_end - t_report_start).count() - << " s\n"; } else #endif #ifdef SOLTRACE_OPTIX_SUPPORT - if (use_optix) + if (use_optix) { OptixRunner runner; @@ -273,18 +341,25 @@ int main(int argc, char *argv[]) << std::chrono::duration(t_run_end - t_run_start).count() << " s\n"; - std::cout << "Retrieving results...\n"; - auto t_report_start = std::chrono::steady_clock::now(); - sts = runner.report_simulation(&result, 0); - auto t_report_end = std::chrono::steady_clock::now(); - if (sts != RunnerStatus::SUCCESS) + if (!skip_output) { - std::cerr << "Error: failed to collect simulation results\n"; - return EXIT_FAILURE; + std::cout << "Retrieving results...\n"; + auto t_report_start = std::chrono::steady_clock::now(); + sts = runner.report_simulation(&result, 0); + auto t_report_end = std::chrono::steady_clock::now(); + if (sts != RunnerStatus::SUCCESS) + { + std::cerr << "Error: failed to collect simulation results\n"; + return EXIT_FAILURE; + } + std::cout << " Retrieved in " + << std::chrono::duration(t_report_end - t_report_start).count() + << " s\n"; + } + else + { + std::cout << "Skipping result retrieval (--no-output).\n"; } - std::cout << " Retrieved in " - << std::chrono::duration(t_report_end - t_report_start).count() - << " s\n"; } else #endif @@ -342,38 +417,52 @@ int main(int argc, char *argv[]) << std::chrono::duration(t_run_end - t_run_start).count() << " s\n"; - std::cout << "Retrieving results...\n"; - auto t_report_start = std::chrono::steady_clock::now(); - sts = runner.report_simulation(&result, 0); - auto t_report_end = std::chrono::steady_clock::now(); - if (sts != RunnerStatus::SUCCESS) + if (!skip_output) { - std::cerr << "Error: failed to collect simulation results\n"; - return EXIT_FAILURE; + std::cout << "Retrieving results...\n"; + auto t_report_start = 
std::chrono::steady_clock::now(); + sts = runner.report_simulation(&result, 0); + auto t_report_end = std::chrono::steady_clock::now(); + if (sts != RunnerStatus::SUCCESS) + { + std::cerr << "Error: failed to collect simulation results\n"; + return EXIT_FAILURE; + } + std::cout << " Retrieved in " + << std::chrono::duration(t_report_end - t_report_start).count() + << " s\n"; + } + else + { + std::cout << "Skipping result retrieval (--no-output).\n"; } - std::cout << " Retrieved in " - << std::chrono::duration(t_report_end - t_report_start).count() - << " s\n"; } // ------------------------------------------------------------------------- // Write results to CSV // ------------------------------------------------------------------------- - std::cout << "Writing " << result.get_number_of_records() - << " ray records to: " << output_file << "...\n"; - try + if (!skip_output) { - auto t_write_start = std::chrono::steady_clock::now(); - result.write_csv_file(output_file); - auto t_write_end = std::chrono::steady_clock::now(); - std::cout << " Written in " - << std::chrono::duration(t_write_end - t_write_start).count() - << " s\n"; + std::cout << "Writing " << result.get_number_of_records() + << " ray records to: " << output_file << "...\n"; + try + { + auto t_write_start = std::chrono::steady_clock::now(); + result.write_csv_file(output_file); + auto t_write_end = std::chrono::steady_clock::now(); + std::cout << " Written in " + << std::chrono::duration(t_write_end - t_write_start).count() + << " s\n"; + } + catch (const std::exception &e) + { + std::cerr << "Error writing CSV file: " << e.what() << "\n"; + return EXIT_FAILURE; + } } - catch (const std::exception &e) + else { - std::cerr << "Error writing CSV file: " << e.what() << "\n"; - return EXIT_FAILURE; + std::cout << "Skipping CSV output (--no-output).\n"; } std::cout << "Done.\n"; diff --git a/coretrace/simulation_runner/embree_runner/trace_embree.cpp b/coretrace/simulation_runner/embree_runner/trace_embree.cpp 
index c87bd23c..90d9c759 100644 --- a/coretrace/simulation_runner/embree_runner/trace_embree.cpp +++ b/coretrace/simulation_runner/embree_runner/trace_embree.cpp @@ -212,7 +212,8 @@ namespace SolTrace::EmbreeRunner myrng, PosSunStage.data, Stage->Origin, Stage->RLocToRef, &System->Sun, PosRayGlob, CosRayGlob, PosRaySun); - System->SunRayCount++; + // System->SunRayCount++; + ++sun_ray_count_local; } else { diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu index 25942217..6f42d353 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu @@ -645,6 +645,7 @@ extern "C" __global__ void __intersection__hexagon_flat() // Verify intersection distance and Report ray intersection point if (t > ray_tmin && t < ray_tmax) { + // TODO: Need to adjust for possible rotation... bool is_in = false; float3 p = ray_orig + ray_dir * t - hex.center; // float d = length(p - circ.center); @@ -703,19 +704,20 @@ extern "C" __global__ void __intersection__annulus_flat() // Verify intersection distance and Report ray intersection point if (t > ray_tmin && t < ray_tmax) { + // TODO: Need to adjust for possible rotation... 
float3 p = ray_orig + ray_dir * t - anf.center; float d = length(p); if (anf.ri <= d && d <= anf.ro) { - float theta = atan2f(p.y, p.x); - if (fabsf(theta) <= 0.5f * anf.arc) - { - optixReportIntersection(t, - 0, - __float_as_uint(n.x), - __float_as_uint(n.y), - __float_as_uint(n.z)); - } + float theta = atan2f(p.y, p.x); + if (fabsf(theta) <= 0.5f * anf.arc) + { + optixReportIntersection(t, + 0, + __float_as_uint(n.x), + __float_as_uint(n.y), + __float_as_uint(n.z)); + } } } } diff --git a/google-tests/unit-tests/simulation_runner/embree_runner/embree_runner_multithreading_test.cpp b/google-tests/unit-tests/simulation_runner/embree_runner/embree_runner_multithreading_test.cpp index 7d9eedb6..c70054cf 100644 --- a/google-tests/unit-tests/simulation_runner/embree_runner/embree_runner_multithreading_test.cpp +++ b/google-tests/unit-tests/simulation_runner/embree_runner/embree_runner_multithreading_test.cpp @@ -73,7 +73,7 @@ TEST(EmbreeRunner, CancelMultithread) auto fsts = std::async(&EmbreeRunner::run_simulation, &runner); // Give time to start processing - std::this_thread::sleep_for(std::chrono::milliseconds(500)); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); sts = runner.status_simulation(); EXPECT_EQ(sts, RunnerStatus::RUNNING); From b8ca9d09da4c17d0821797e32da2c362b7c61350 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 22 Apr 2026 13:49:09 -0600 Subject: [PATCH 12/60] Set recommended hardware flags for embree; fix an unecessary copy in NativeRunner::report_simulation --- .../embree_runner/ftz_daz.hpp | 36 ++ .../embree_runner/trace_embree.cpp | 354 +++++++++++++----- .../native_runner/native_runner.cpp | 2 +- 3 files changed, 306 insertions(+), 86 deletions(-) create mode 100644 coretrace/simulation_runner/embree_runner/ftz_daz.hpp diff --git a/coretrace/simulation_runner/embree_runner/ftz_daz.hpp b/coretrace/simulation_runner/embree_runner/ftz_daz.hpp new file mode 100644 index 00000000..341ff7c0 --- /dev/null +++ 
b/coretrace/simulation_runner/embree_runner/ftz_daz.hpp @@ -0,0 +1,36 @@ +#ifndef SOLTRACE_FTZ_DAZ_HPP +#define SOLTRACE_FTZ_DAZ_HPP + +// Set Flush-to-Zero (FTZ) and Denormals-are-Zero (DAZ) floating-point flags +// for the calling thread. These are thread-local CPU register settings that +// avoid slow denormal handling in the FPU, as recommended by the Embree docs. +// On ARM, DAZ is implicit when FZ is set (no separate bit). + +#if defined(__SSE__) || defined(_M_X64) || defined(_M_IX86) +# include +# include +# define SOLTRACE_SET_FTZ_DAZ() \ + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \ + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON) +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) +# if defined(_MSC_VER) +# include +# define SOLTRACE_SET_FTZ_DAZ() do { \ + uint64_t _fpcr = _ReadStatusReg(ARM64_FPCR); \ + _fpcr |= (1ULL << 24); \ + _WriteStatusReg(ARM64_FPCR, _fpcr); \ + } while(0) +# else + /* GCC / Clang on ARM64 */ +# define SOLTRACE_SET_FTZ_DAZ() do { \ + uint64_t _fpcr; \ + __asm__ __volatile__("mrs %0, fpcr" : "=r"(_fpcr)); \ + _fpcr |= (1ULL << 24); \ + __asm__ __volatile__("msr fpcr, %0" : : "r"(_fpcr)); \ + } while(0) +# endif +#else +# define SOLTRACE_SET_FTZ_DAZ() /* unsupported architecture, no-op */ +#endif + +#endif // SOLTRACE_FTZ_DAZ_HPP diff --git a/coretrace/simulation_runner/embree_runner/trace_embree.cpp b/coretrace/simulation_runner/embree_runner/trace_embree.cpp index 90d9c759..10cf1f5f 100644 --- a/coretrace/simulation_runner/embree_runner/trace_embree.cpp +++ b/coretrace/simulation_runner/embree_runner/trace_embree.cpp @@ -1,7 +1,9 @@ #include "trace_embree.hpp" #include +#include #include +#include #include #include #include @@ -27,6 +29,7 @@ #include "embree_helper.hpp" #include "find_element_hit_embree.hpp" +#include "ftz_daz.hpp" namespace SolTrace::EmbreeRunner { @@ -56,11 +59,16 @@ namespace SolTrace::EmbreeRunner // RTCScene embree_scene = nullptr; // bool use_shared_embree = false; - // Make device - 
// std::cout << "Making embree device..." << std::endl; - std::stringstream ss; - ss << "threads=" << nthreads; - embree_device = rtcNewDevice(ss.str().c_str()); + // // Make device + // // std::cout << "Making embree device..." << std::endl; + // std::stringstream ss; + // ss << "threads=" << nthreads; + // embree_device = rtcNewDevice(ss.str().c_str()); + + // TODO: Need to test this on largest scenes we expect to + // trace. Adding more threads does not significantly help + // when there are only ~6000 elements. + embree_device = rtcNewDevice("threads=1"); // std::cout << "Setting error function..." << std::endl; rtcSetDeviceErrorFunction(embree_device, error_function, NULL); @@ -98,9 +106,16 @@ namespace SolTrace::EmbreeRunner const RTCScene &embree_scene) { + // using Clock = std::chrono::steady_clock; + // using Seconds = std::chrono::duration; + + // auto t_start = Clock::now(); + System->RayData.SetUp(nthreads, NumberOfRays); System->SunRayCount = 0; + // auto t_after_setup = Clock::now(); + // Initialize Sun Vector3d PosSunStage; bool status = SolTrace::NativeRunner::SunToPrimaryStage( @@ -110,6 +125,8 @@ namespace SolTrace::EmbreeRunner if (!status) return RunnerStatus::ERROR; + // auto t_after_sun_init = Clock::now(); + uint_fast64_t rem = NumberOfRays % nthreads; uint_fast64_t nrays_per_thread = NumberOfRays / nthreads; uint_fast64_t nrays; @@ -134,7 +151,27 @@ namespace SolTrace::EmbreeRunner manager->manage(k, std::move(my_future)); } - return manager->monitor_until_completion(); + // auto t_after_launch = Clock::now(); + + RunnerStatus result = manager->monitor_until_completion(); + + // auto t_done = Clock::now(); + + // double s_setup = Seconds(t_after_setup - t_start).count(); + // double s_sun_init = Seconds(t_after_sun_init - t_after_setup).count(); + // double s_launch = Seconds(t_after_launch - t_after_sun_init).count(); + // double s_ray_trace = Seconds(t_done - t_after_launch).count(); + // double s_total = Seconds(t_done - t_start).count(); 
+ + // std::cout << "[trace_embree] timing (nthreads=" << nthreads + // << ", rays=" << NumberOfRays << ")\n" + // << " ray_data_setup : " << s_setup << " s\n" + // << " sun_init : " << s_sun_init << " s\n" + // << " thread_launch : " << s_launch << " s\n" + // << " ray_trace : " << s_ray_trace << " s\n" + // << " total : " << s_total << " s\n"; + + return result; } RunnerStatus trace_embree_single_thread( @@ -152,6 +189,10 @@ namespace SolTrace::EmbreeRunner { // std::cout << "Thread " << thread_id << " with seed " << seed // << std::endl; + // Set flush-to-zero and denormals-are-zero for this thread to avoid + // slow denormal handling in the FPU (recommended by Embree docs). + SOLTRACE_SET_FTZ_DAZ(); + // Initialize Internal State Variables MTRand myrng(seed); @@ -175,6 +216,72 @@ namespace SolTrace::EmbreeRunner uint_fast64_t n_rays_active = NumberOfRays; uint_fast64_t sun_ray_count_local = 0; + // // Timing accumulators + // using Clock = std::chrono::steady_clock; + // using ns_t = long long; + // ns_t t_generate_ray = 0; + // ns_t t_transform_to_local = 0; + // ns_t t_find_element_hit = 0; + // ns_t t_determine_interaction = 0; + // ns_t t_process_interaction = 0; + // ns_t t_transform_to_reference = 0; + // ns_t t_ray_data_append = 0; + // ns_t t_progress_update = 0; + // uint_fast64_t n_find_element_hit = 0; + // uint_fast64_t n_determine_interaction = 0; + // uint_fast64_t n_process_interaction = 0; + // uint_fast64_t n_ray_data_append = 0; + + // auto write_timing = [&]() { + // std::string fname = "trace_embree_timing_thread_" + + // std::to_string(thread_id) + ".csv"; + // std::ofstream f(fname); + // constexpr double ns_to_s = 1.0e-9; + // ns_t t_total = t_generate_ray + t_transform_to_local + t_find_element_hit + + // t_determine_interaction + t_process_interaction + + // t_transform_to_reference + t_ray_data_append + t_progress_update; + // auto pct = [&](ns_t t) -> double { + // return t_total > 0 ? 
100.0 * static_cast(t) / static_cast(t_total) : 0.0; + // }; + // f << std::fixed; + // f << "section,calls,seconds,pct_total\n"; + // f << "generate_ray,," << t_generate_ray * ns_to_s << "," << pct(t_generate_ray) << "\n"; + // f << "transform_to_local,," << t_transform_to_local * ns_to_s << "," << pct(t_transform_to_local) << "\n"; + // f << "find_element_hit," << n_find_element_hit << "," << t_find_element_hit * ns_to_s << "," << pct(t_find_element_hit) << "\n"; + // f << "determine_interaction," << n_determine_interaction << "," << t_determine_interaction * ns_to_s << "," << pct(t_determine_interaction) << "\n"; + // f << "process_interaction," << n_process_interaction << "," << t_process_interaction * ns_to_s << "," << pct(t_process_interaction) << "\n"; + // f << "transform_to_reference,,"<< t_transform_to_reference * ns_to_s << "," << pct(t_transform_to_reference) << "\n"; + // f << "ray_data_append," << n_ray_data_append << "," << t_ray_data_append * ns_to_s << "," << pct(t_ray_data_append) << "\n"; + // f << "progress_update,," << t_progress_update * ns_to_s << "," << pct(t_progress_update) << "\n"; + // f << "total,," << t_total * ns_to_s << ",100.0\n"; + // }; + + // auto write_timing = [&]() { + // // std::string fname = "trace_embree_timing_thread_" + + // // std::to_string(thread_id) + ".csv"; + // // std::ofstream f(fname); + // std::stringstream f; + // constexpr double ns_to_s = 1.0e-9; + // ns_t t_total = t_generate_ray + t_transform_to_local + t_find_element_hit + + // t_determine_interaction + t_process_interaction + + // t_transform_to_reference + t_ray_data_append + t_progress_update; + // auto pct = [&](ns_t t) -> double { + // return t_total > 0 ? 
100.0 * static_cast(t) / static_cast(t_total) : 0.0; + // }; + // f << "thread_id " << thread_id << "\n" << std::fixed; + // f << "section,calls,seconds,pct_total\n"; + // f << "generate_ray,," << t_generate_ray * ns_to_s << "," << pct(t_generate_ray) << "\n"; + // f << "transform_to_local,," << t_transform_to_local * ns_to_s << "," << pct(t_transform_to_local) << "\n"; + // f << "find_element_hit," << n_find_element_hit << "," << t_find_element_hit * ns_to_s << "," << pct(t_find_element_hit) << "\n"; + // f << "determine_interaction," << n_determine_interaction << "," << t_determine_interaction * ns_to_s << "," << pct(t_determine_interaction) << "\n"; + // f << "process_interaction," << n_process_interaction << "," << t_process_interaction * ns_to_s << "," << pct(t_process_interaction) << "\n"; + // f << "transform_to_reference,,"<< t_transform_to_reference * ns_to_s << "," << pct(t_transform_to_reference) << "\n"; + // f << "ray_data_append," << n_ray_data_append << "," << t_ray_data_append * ns_to_s << "," << pct(t_ray_data_append) << "\n"; + // f << "progress_update,," << t_progress_update * ns_to_s << "," << pct(t_progress_update) << "\n"; + // f << "total,," << t_total * ns_to_s << ",100.0\n"; + // std::cout << f.str() << std::endl; + // }; + // Loop through stages for (uint_fast64_t i = 0; i < System->StageList.size(); i++) { @@ -208,10 +315,15 @@ namespace SolTrace::EmbreeRunner { // Make ray (if first stage) double PosRaySun[3]; - SolTrace::NativeRunner::GenerateRay( - myrng, PosSunStage.data, Stage->Origin, - Stage->RLocToRef, &System->Sun, - PosRayGlob, CosRayGlob, PosRaySun); + { + // auto _t0 = Clock::now(); + SolTrace::NativeRunner::GenerateRay( + myrng, PosSunStage.data, Stage->Origin, + Stage->RLocToRef, &System->Sun, + PosRayGlob, CosRayGlob, PosRaySun); + // t_generate_ray += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + } // System->SunRayCount++; ++sun_ray_count_local; } @@ -225,9 +337,14 @@ namespace SolTrace::EmbreeRunner } 
// transform the global incoming ray to local stage coordinates - TransformToLocal(PosRayGlob, CosRayGlob, - Stage->Origin, Stage->RRefToLoc, - PosRayStage, CosRayStage); + { + // auto _t0 = Clock::now(); + TransformToLocal(PosRayGlob, CosRayGlob, + Stage->Origin, Stage->RRefToLoc, + PosRayStage, CosRayStage); + // t_transform_to_local += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + } // Initialize internal variables for ray intersection tracing bool RayInStage = true; @@ -251,12 +368,18 @@ namespace SolTrace::EmbreeRunner while (RayInStage) { - FindElementHit_embree(embree_scene, i, RayNumber, - PosRayGlob, CosRayGlob, - LastPosRaySurfElement, LastCosRaySurfElement, - LastDFXYZ, LastElementNumber, LastRayNumber, - LastPosRaySurfStage, LastCosRaySurfStage, - ErrorFlag, LastHitBackSide, StageHit); + { + // auto _t0 = Clock::now(); + FindElementHit_embree(embree_scene, i, RayNumber, + PosRayGlob, CosRayGlob, + LastPosRaySurfElement, LastCosRaySurfElement, + LastDFXYZ, LastElementNumber, LastRayNumber, + LastPosRaySurfStage, LastCosRaySurfStage, + ErrorFlag, LastHitBackSide, StageHit); + // t_find_element_hit += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + // ++n_find_element_hit; + } // Breakout if ray left stage if (!StageHit) @@ -271,6 +394,7 @@ namespace SolTrace::EmbreeRunner if (i == 0 && MultipleHitCount == 1) { // Add ray to Stage RayData + // auto _t0_append = Clock::now(); auto r = System->RayData.Append(thread_id, PosRayGlob, CosRayGlob, @@ -278,6 +402,9 @@ namespace SolTrace::EmbreeRunner i + 1, LastRayNumber, RayEvent::CREATE); + // t_ray_data_append += std::chrono::duration_cast( + // Clock::now() - _t0_append).count(); + // ++n_ray_data_append; if (r == nullptr) { @@ -305,8 +432,10 @@ namespace SolTrace::EmbreeRunner else optics = &optelm->Optics.Front; - bool good = - SolTrace::NativeRunner::determine_interaction_type( + bool good; + { + // auto _t0 = Clock::now(); + good = 
SolTrace::NativeRunner::determine_interaction_type( logger, i, thread_id, @@ -315,9 +444,14 @@ namespace SolTrace::EmbreeRunner LastDFXYZ, LastCosRaySurfElement, rev); + // t_determine_interaction += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + // ++n_determine_interaction; + } if (!good) { + // write_timing(); return RunnerStatus::ERROR; } @@ -330,43 +464,60 @@ namespace SolTrace::EmbreeRunner // Process Interaction int k = LastElementNumber - 1; - SolTrace::NativeRunner::ProcessInteraction( - System, - myrng, - IncludeSunShape, - optics, - IncludeErrors, - i, - Stage, - MultipleHitCount, - LastDFXYZ, - LastCosRaySurfElement, - ErrorFlag, - CosRayOutElement, - LastPosRaySurfElement, - PosRayOutElement); + { + // auto _t0 = Clock::now(); + SolTrace::NativeRunner::ProcessInteraction( + System, + myrng, + IncludeSunShape, + optics, + IncludeErrors, + i, + Stage, + MultipleHitCount, + LastDFXYZ, + LastCosRaySurfElement, + ErrorFlag, + CosRayOutElement, + LastPosRaySurfElement, + PosRayOutElement); + // t_process_interaction += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + // ++n_process_interaction; + } // Transform ray back to stage coordinate system - TransformToReference(PosRayOutElement, - CosRayOutElement, - Stage->ElementList[k]->Origin, - Stage->ElementList[k]->RLocToRef, - PosRayStage, - CosRayStage); - TransformToReference(PosRayStage, - CosRayStage, - Stage->Origin, - Stage->RLocToRef, - PosRayGlob, - CosRayGlob); - - System->RayData.Append(thread_id, - PosRayGlob, - CosRayGlob, - LastElementNumber, - i + 1, - LastRayNumber, - rev); + { + // auto _t0 = Clock::now(); + TransformToReference(PosRayOutElement, + CosRayOutElement, + Stage->ElementList[k]->Origin, + Stage->ElementList[k]->RLocToRef, + PosRayStage, + CosRayStage); + TransformToReference(PosRayStage, + CosRayStage, + Stage->Origin, + Stage->RLocToRef, + PosRayGlob, + CosRayGlob); + // t_transform_to_reference += std::chrono::duration_cast( + // Clock::now() - 
_t0).count(); + } + + { + // auto _t0 = Clock::now(); + System->RayData.Append(thread_id, + PosRayGlob, + CosRayGlob, + LastElementNumber, + i + 1, + LastRayNumber, + rev); + // t_ray_data_append += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + // ++n_ray_data_append; + } // Break out if multiple hits are not allowed if (!Stage->MultiHitsPerRay) @@ -385,29 +536,47 @@ namespace SolTrace::EmbreeRunner if (update_count % update_rate == 0) { + // auto _t0 = Clock::now(); double progress = update_count / total_work; manager->progress_update(thread_id, progress); - if (manager->terminate(thread_id)) + bool should_cancel = manager->terminate(thread_id); + // t_progress_update += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + if (should_cancel) + { + // write_timing(); return RunnerStatus::CANCEL; + } } // Handle if Ray was absorbed if (RayIsAbsorbed) { - TransformToReference(LastPosRaySurfStage, - LastCosRaySurfStage, - Stage->Origin, - Stage->RLocToRef, - PosRayGlob, - CosRayGlob); - - System->RayData.Append(thread_id, - PosRayGlob, - CosRayGlob, - LastElementNumber, - i + 1, - LastRayNumber, - RayEvent::ABSORB); + { + // auto _t0 = Clock::now(); + TransformToReference(LastPosRaySurfStage, + LastCosRaySurfStage, + Stage->Origin, + Stage->RLocToRef, + PosRayGlob, + CosRayGlob); + // t_transform_to_reference += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + } + + { + // auto _t0 = Clock::now(); + System->RayData.Append(thread_id, + PosRayGlob, + CosRayGlob, + LastElementNumber, + i + 1, + LastRayNumber, + RayEvent::ABSORB); + // t_ray_data_append += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + // ++n_ray_data_append; + } n_rays_active--; @@ -511,13 +680,19 @@ namespace SolTrace::EmbreeRunner { LastRayNumber = RayNumber; - System->RayData.Append(thread_id, - PosRayGlob, - CosRayGlob, - ELEMENT_NULL, - i + 1, - LastRayNumber, - RayEvent::EXIT); + { + // auto _t0 = Clock::now(); + 
System->RayData.Append(thread_id, + PosRayGlob, + CosRayGlob, + ELEMENT_NULL, + i + 1, + LastRayNumber, + RayEvent::EXIT); + // t_ray_data_append += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + // ++n_ray_data_append; + } n_rays_active--; @@ -567,6 +742,7 @@ namespace SolTrace::EmbreeRunner // size_t pp = IncomingRays[PreviousStageDataArrayIndex - 1].Num; // System->errlog("LastRayNumberInPreviousStage=0, stage %d, PrevIdx=%d, CurIdx=%d, pp=%d", i + 1, // PreviousStageDataArrayIndex, StageDataArrayIndex, pp); + // write_timing(); return RunnerStatus::ERROR; } } @@ -574,6 +750,7 @@ namespace SolTrace::EmbreeRunner { // System->errlog("Invalid PreviousStageDataArrayIndex: %u, @ stage %d", // PreviousStageDataArrayIndex, i + 1); + // write_timing(); return RunnerStatus::ERROR; } } @@ -584,18 +761,25 @@ namespace SolTrace::EmbreeRunner for (uint_fast64_t k = 0; k < n_rays_active; ++k) { GlobalRay_refactored ray = IncomingRays[k]; - System->RayData.Append(thread_id, - ray.Pos, - ray.Cos, - ELEMENT_NULL, - idx + 1, - ray.Num, - RayEvent::EXIT); + { + // auto _t0 = Clock::now(); + System->RayData.Append(thread_id, + ray.Pos, + ray.Cos, + ELEMENT_NULL, + idx + 1, + ray.Num, + RayEvent::EXIT); + // t_ray_data_append += std::chrono::duration_cast( + // Clock::now() - _t0).count(); + // ++n_ray_data_append; + } } // System->SunRayCount is atomic so this is thread safe System->SunRayCount += sun_ray_count_local; + // if (thread_id == 0) write_timing(); return RunnerStatus::SUCCESS; } diff --git a/coretrace/simulation_runner/native_runner/native_runner.cpp b/coretrace/simulation_runner/native_runner/native_runner.cpp index 9667e043..ea25b576 100644 --- a/coretrace/simulation_runner/native_runner/native_runner.cpp +++ b/coretrace/simulation_runner/native_runner/native_runner.cpp @@ -327,7 +327,7 @@ namespace SolTrace::NativeRunner const TSystem *sys = this->get_system(); // const TRayData ray_data = sys->AllRayData; - const TRayData ray_data = sys->RayData; + 
const TRayData& ray_data = sys->RayData; std::map ray_records; std::map::iterator iter; uint_fast64_t ndata = ray_data.Count(); From 9001288607db3f6649e5ea95a0f49af9d51ede04 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Fri, 8 May 2026 13:30:43 -0600 Subject: [PATCH 13/60] OptixRunner compile error fixes --- .../optix_runner/OptixCSP/src/core/CspElement.cpp | 11 +++++++++++ .../optix_runner/OptixCSP/src/core/CspElement.h | 7 +++++++ .../simulation_runner/optix_runner/optix_runner.cpp | 10 ++++++++-- .../optix_runner/geometry_intersection_test.cpp | 6 +++--- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp index d1d31a18..75a5e1e0 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp @@ -22,6 +22,7 @@ CspElementBase::CspElementBase() CspElement::CspElement() { m_origin = Vec3d(0.0, 0.0, 0.0); + m_aim_point = Vec3d(0.0, 0.0, 1.0); m_rotation_matrix = Matrix33d(); m_surface = nullptr; m_aperture = nullptr; @@ -42,6 +43,16 @@ void CspElement::set_origin(const Vec3d &o) m_origin = o; } +const Vec3d &CspElement::get_aim_point() const +{ + return m_aim_point; +} + +void CspElement::set_aim_point(const Vec3d &ap) +{ + m_aim_point = ap; +} + std::shared_ptr CspElement::get_aperture() const { return m_aperture; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.h index 78a0572e..0a89c0cc 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.h @@ -26,6 +26,9 @@ namespace OptixCSP // Positioning and orientation. 
virtual const Vec3d &get_origin() const = 0; virtual void set_origin(const OptixCSP::Vec3d &) = 0; + virtual const Vec3d &get_aim_point() const = 0; + virtual void set_aim_point(const Vec3d &o) = 0; + // virtual const Vec3d& get_euler_angles() const = 0; // virtual void set_euler_angles(const Vec3d&) = 0; @@ -55,6 +58,9 @@ namespace OptixCSP const Vec3d &get_origin() const override; void set_origin(const Vec3d &o) override; + const Vec3d &get_aim_point() const override; + void set_aim_point(const Vec3d &o) override; + std::shared_ptr get_aperture() const; std::shared_ptr get_surface() const; ApertureType get_aperture_type() const; @@ -106,6 +112,7 @@ namespace OptixCSP const OpticalDistribution od); Vec3d m_origin; + Vec3d m_aim_point; Matrix33d m_rotation_matrix; Vec3d m_upper_box_bound; // Global coordinates diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index b1ef7394..552d3a24 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -124,8 +124,10 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) auto optix_el = std::make_shared(); auto origin = el->get_origin_global(); + auto ap = el->get_aim_vector_global(); OptixCSP::Vec3d origin_vec(origin.x, origin.y, origin.z); optix_el->set_origin(ToVec3d(origin)); + optix_el->set_aim_point(ToVec3d(ap)); optix_el->set_rotation_matrix(ToMatrix33d(el->get_local_to_global())); // Safely narrow element id to int32_t @@ -236,8 +238,12 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) auto el_aperture = std::dynamic_pointer_cast(el->get_aperture()); assert(el_aperture != nullptr); // TODO: account for x and y coord? 
- auto aperture = std::make_shared( - el_aperture->x_length, el_aperture->y_length); + // auto aperture = std::make_shared(el_aperture->x_length(), + // el_aperture->y_length()); + auto aperture = std::make_shared(el_aperture->x_length(), + el_aperture->y_length(), + el_aperture->x_coord(), + el_aperture->y_coord()); optix_el->set_aperture(aperture); break; } diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp index 5070a1ca..2ad27e61 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp @@ -295,7 +295,7 @@ TEST(OptixRunner, FlatCircle) { auto rr = result[i]; ASSERT_GE(rr->get_number_of_interactions(), 2); - Vector3d p0, p1; + glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; @@ -331,7 +331,7 @@ TEST(OptixRunner, FlatHexagon) { auto rr = result[i]; ASSERT_GE(rr->get_number_of_interactions(), 2); - Vector3d p0, p1; + glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; @@ -369,7 +369,7 @@ TEST(OptixRunner, FlatAnnulus) { auto rr = result[i]; ASSERT_GE(rr->get_number_of_interactions(), 2); - Vector3d p0, p1; + glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; From 5015eca2da53f1ec5bfe663ac1784d1cbb11255a Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Fri, 8 May 2026 13:46:36 -0600 Subject: [PATCH 14/60] Fix badly merged things --- coretrace/CMakeLists.txt | 4 +-- coretrace/simdata_bridge.cpp | 26 +++++++++---------- .../OptixCSP/src/core/CspElement.cpp | 16 ++++++------ 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/coretrace/CMakeLists.txt b/coretrace/CMakeLists.txt index 
8f9b1e39..63deb273 100644 --- a/coretrace/CMakeLists.txt +++ b/coretrace/CMakeLists.txt @@ -28,7 +28,7 @@ include_directories(. ./simulation_results ./simulation_runner ./simulation_runner/native_runner - # ./simulation_runner/optix_runner + ./simulation_runner/optix_runner ./simulation_runner/embree_runner ) @@ -136,7 +136,7 @@ if(SOLTRACE_BUILD_CORETRACE) PRIVATE ${embree_INCLUDE_DIRS} ) - target_link_libraries(coretrace_api PRIVATE simdata native_runner simresult embree_runner ${embree_LIBRARIES}) + target_link_libraries(coretrace_api PRIVATE simdata native_runner optix_runner simresult embree_runner ${embree_LIBRARIES}) if(MSVC) target_compile_definitions(coretrace_api PRIVATE STCORE_API_EXPORTS _STCOREDLL_) diff --git a/coretrace/simdata_bridge.cpp b/coretrace/simdata_bridge.cpp index caddda9e..7dbc3a35 100644 --- a/coretrace/simdata_bridge.cpp +++ b/coretrace/simdata_bridge.cpp @@ -6,7 +6,7 @@ #include "stage_element.hpp" #include "single_element.hpp" #include "virtual_element.hpp" -// #include +#include #include #include "simdata_io.hpp" #include @@ -80,20 +80,20 @@ int assign_raydata_from_hitpoints(const std::vector& hp_vec, const std:: int element_number = 1; // Don't get element number from optix int raynumber = raynumber_vec[i_element]; -// // Only add ray data if it is Not the sun ray ('stage' 0) -// if (stage_optix != 0) -// { -// sys->StageList[0]->RayData.Append(PosRaySurfStage, CosRaySurfStage, element_number, -// stage_optix, raynumber); -// } -// i_element++; -// } + // Only add ray data if it is Not the sun ray ('stage' 0) + if (stage_optix != 0) + { + sys->StageList[0]->RayData.Append(PosRaySurfStage, CosRaySurfStage, element_number, + stage_optix, raynumber); + } + i_element++; + } -// for (TStage* stage : sys->StageList) -// sys->AllRayData.Merge(stage->RayData); + for (TStage* stage : sys->StageList) + sys->AllRayData.Merge(stage->RayData); -// return 0; -// } + return 0; +} int set_tstage_parameters(TSystem* sys_legacy, const 
SolTrace::NativeRunner::TSystem& sys_native) { diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp index 75a5e1e0..21f40049 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp @@ -102,7 +102,7 @@ Matrix33d CspElement::get_rotation_matrix() const return m_rotation_matrix; } -void CspElement::set_rotation_matrix(const Matrix33d& rotation_matrix) +void CspElement::set_rotation_matrix(const Matrix33d &rotation_matrix) { m_rotation_matrix = rotation_matrix; } @@ -229,14 +229,14 @@ GeometryDataST CspElement::toDeviceGeometryData() const Vec3d edge_y = v2 * (float)height; Vec3d local_anchor(x_coord + width, y_coord, 0.0); - //float3 anchor = OptixCSP::toFloat3(m_origin - v1 * 0.5 - v2 * 0.5); + // float3 anchor = OptixCSP::toFloat3(m_origin - v1 * 0.5 - v2 * 0.5); Vec3d global_anchor = rotation_matrix * local_anchor + m_origin; - GeometryDataST::Rectangle_Parabolic heliostat(OptixCSP::toFloat3(edge_x), - OptixCSP::toFloat3(edge_y), - OptixCSP::toFloat3(global_anchor), - (float)m_surface->get_curvature_1(), - (float)m_surface->get_curvature_2()); + GeometryDataST::Rectangle_Parabolic heliostat(OptixCSP::toFloat3(edge_x), + OptixCSP::toFloat3(edge_y), + OptixCSP::toFloat3(global_anchor), + (float)m_surface->get_curvature_1(), + (float)m_surface->get_curvature_2()); geometry_data.setRectangleParabolic(heliostat); } @@ -333,7 +333,7 @@ GeometryDataST CspElement::toDeviceGeometryData() const ApertureAnnulus anf = static_cast(*m_aperture); float radius_in = anf.get_radius_inner(); float radius_out = anf.get_radius_outer(); - float arc = anf.get_arc(); + float arc = anf.get_arc(); float3 o = OptixCSP::toFloat3(m_origin); float3 n = normalize(OptixCSP::toFloat3(m_aim_point - m_origin)); if (surface_type == SurfaceType::FLAT) From 
248d4ccad61833abb15384bebf6328aed5bf71e0 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Fri, 8 May 2026 15:59:54 -0600 Subject: [PATCH 15/60] Add timing optix run function --- .../OptixCSP/src/core/soltrace_system.cpp | 41 ++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index f38ecacb..5cd2a60c 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -69,7 +69,7 @@ void SolTraceSystem::print_launch_params() SolTraceSystem::SolTraceSystem() : m_number_of_rays(0), m_max_number_of_rays(0), - m_verbose(false), + m_verbose(true), m_mem_free_before(0), m_mem_free_after(0), m_optical_errors(false), @@ -254,13 +254,22 @@ void SolTraceSystem::run() uint_fast64_t N_ray_hit = 0; uint_fast64_t N_ray_gen = 0; + Timer timer_setup_buffer; + Timer timer_optix_launch; + Timer timer_collect_results; + uint64_t n_iterations = 0; + while (N_ray_hit < m_number_of_rays && N_ray_gen < m_max_number_of_rays) { + ++n_iterations; + // Update ray offset (pushed to device in setup_device_buffer) data_manager->launch_params_H.ray_offset = N_ray_gen; // Allocate buffer (sets data_manager->launch_params_H buffer) + timer_setup_buffer.start(); setup_device_buffer(); + timer_setup_buffer.stop(); int width = data_manager->launch_params_H.width; int height = data_manager->launch_params_H.height; @@ -272,8 +281,8 @@ void SolTraceSystem::run() if(m_verbose) std::cout << "Memory used by launch: " << (m_mem_free_before - m_mem_free_after) / (1024.0 * 1024.0) << " MB\n"; - m_timer_trace.start(); // Launch the simulation. + timer_optix_launch.start(); OPTIX_CHECK(optixLaunch( m_state.pipeline, m_state.stream, // Assume this stream is properly created. 
@@ -284,10 +293,14 @@ void SolTraceSystem::run() height, 1)); CUDA_SYNC_CHECK(); + timer_optix_launch.stop(); // Collect results + timer_collect_results.start(); get_buffer_results(m_hp_vec, m_raynumber_vec, m_element_id_vec, m_hit_type_vec, m_sunraynumber_vec); + timer_collect_results.stop(); + N_ray_hit = m_raynumber_vec.empty() ? 0 : m_raynumber_vec.back(); N_ray_gen += width; } @@ -306,6 +319,30 @@ void SolTraceSystem::run() } m_timer_trace.stop(); + + if (m_verbose) + { + const double t_setup = timer_setup_buffer.get_time_sec(); + const double t_launch = timer_optix_launch.get_time_sec(); + const double t_collect = timer_collect_results.get_time_sec(); + const double t_total = t_setup + t_launch + t_collect; + const double inv_n = n_iterations > 0 ? 1.0 / static_cast(n_iterations) : 0.0; + + std::cout << "\n--- SolTraceSystem::run() timing (" << n_iterations << " iteration" + << (n_iterations == 1 ? "" : "s") << ") ---\n"; + std::cout << std::fixed << std::setprecision(6); + std::cout << " setup_device_buffer : total = " << t_setup << " s" + << " avg = " << t_setup * inv_n << " s" + << " fraction = " << (t_total > 0.0 ? 100.0 * t_setup / t_total : 0.0) << " %\n"; + std::cout << " optixLaunch : total = " << t_launch << " s" + << " avg = " << t_launch * inv_n << " s" + << " fraction = " << (t_total > 0.0 ? 100.0 * t_launch / t_total : 0.0) << " %\n"; + std::cout << " get_buffer_results : total = " << t_collect << " s" + << " avg = " << t_collect * inv_n << " s" + << " fraction = " << (t_total > 0.0 ? 
100.0 * t_collect / t_total : 0.0) << " %\n"; + std::cout << " total (3 sections) : " << t_total << " s\n"; + std::cout << "----------------------------------------------\n"; + } } void SolTraceSystem::update() From f22ea2e55de986cfc1593b70a0e6f46dc078631f Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 12 May 2026 14:01:02 -0600 Subject: [PATCH 16/60] Change host side buffer to page-locked memory; integrate hit data into single structure --- .../OptixCSP/src/core/soltrace_system.cpp | 304 ++++++++++-------- .../OptixCSP/src/core/soltrace_system.h | 83 +++-- .../OptixCSP/src/shaders/Soltrace.h | 15 +- .../OptixCSP/src/shaders/materials.cu | 6 +- .../optix_runner/OptixCSP/src/shaders/sun.cu | 9 +- 5 files changed, 233 insertions(+), 184 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 5cd2a60c..05c30567 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -1,21 +1,26 @@ #include "soltrace_system.h" -#include "geometry_manager.h" + +#include "CspElement.h" #include "data_manager.h" +#include "geometry_manager.h" #include "pipeline_manager.h" +#include "soltrace_constants.h" #include "soltrace_type.h" -#include "CspElement.h" #include "timer.h" -#include "soltrace_constants.h" + #include "../../../../../simulation_data/simdata_io.hpp" #include "../../../../../simulation_data/solar_position_calculators/basic_sun_position.hpp" +#include "shaders/Soltrace.h" + #include "utils/util_record.hpp" #include "utils/util_check.hpp" #include "utils/math_util.h" + +#include #include #include #include -#include #include #include @@ -28,44 +33,6 @@ using namespace OptixCSP; // note that this is has to be per optical entity type. 
typedef Record HitGroupRecord; -void SolTraceSystem::set_verbose(bool verbose) -{ - m_verbose = verbose; - geometry_manager->set_verbose(verbose); - pipeline_manager->set_verbose(verbose); -} - -void SolTraceSystem::print_launch_params() -{ - if (!m_verbose) - { - return; - } - - LaunchParams params = data_manager->launch_params_H; - - float3 sun_box_a = params.sun_v0 - params.sun_v1; - float3 sun_box_b = params.sun_v1 - params.sun_v2; - - float sun_box_edge_a = sqrtf(sun_box_a.x * sun_box_a.x + sun_box_a.y * sun_box_a.y + sun_box_a.z * sun_box_a.z); - float sun_box_edge_b = sqrtf(sun_box_b.x * sun_box_b.x + sun_box_b.y * sun_box_b.y + sun_box_b.z * sun_box_b.z); - - std::cout << "print launch params: " << std::endl; - std::cout << "width : " << params.width << std::endl; - std::cout << "height : " << params.height << std::endl; - std::cout << "max_depth : " << params.max_depth << std::endl; - std::cout << "hit_point_buffer : " << params.hit_point_buffer << std::endl; - std::cout << "sun_dir_buffer : " << params.sun_dir_buffer << std::endl; - std::cout << "sun_vector : " << params.sun_vector.x << " " << params.sun_vector.y << " " << params.sun_vector.z << std::endl; - // std::cout << "max_sun_angle : " << params.max_sun_angle << std::endl; - std::cout << "sun_v0 : " << params.sun_v0.x << " " << params.sun_v0.y << " " << params.sun_v0.z << std::endl; - std::cout << "sun_v1 : " << params.sun_v1.x << " " << params.sun_v1.y << " " << params.sun_v1.z << std::endl; - std::cout << "sun_v2 : " << params.sun_v2.x << " " << params.sun_v2.y << " " << params.sun_v2.z << std::endl; - std::cout << "sun_v3 : " << params.sun_v3.x << " " << params.sun_v3.y << " " << params.sun_v3.z << std::endl; - std::cout << "sun_box_edge_a : " << sun_box_edge_a << std::endl; - std::cout << "sun_box_edge_b : " << sun_box_edge_b << std::endl; -} - SolTraceSystem::SolTraceSystem() : m_number_of_rays(0), m_max_number_of_rays(0), @@ -73,6 +40,8 @@ SolTraceSystem::SolTraceSystem() 
m_mem_free_before(0), m_mem_free_after(0), m_optical_errors(false), + m_hit_buffer_host(nullptr), + m_hit_buffer_host_capacity(0), m_include_sun_shape_errors(false), m_timer_setup(), m_timer_trace(), @@ -87,9 +56,9 @@ SolTraceSystem::SolTraceSystem() if (m_verbose) { std::cout << "Using OPTIX Version: " << major - << "." << minor - << "." << micro - << std::endl; + << "." << minor + << "." << micro + << std::endl; } CUDA_CHECK(cudaFree(0)); @@ -111,6 +80,45 @@ SolTraceSystem::~SolTraceSystem() { } +void SolTraceSystem::set_verbose(bool verbose) +{ + m_verbose = verbose; + geometry_manager->set_verbose(verbose); + pipeline_manager->set_verbose(verbose); +} + +void SolTraceSystem::print_launch_params() +{ + if (!m_verbose) + { + return; + } + + LaunchParams params = data_manager->launch_params_H; + + float3 sun_box_a = params.sun_v0 - params.sun_v1; + float3 sun_box_b = params.sun_v1 - params.sun_v2; + + float sun_box_edge_a = sqrtf(sun_box_a.x * sun_box_a.x + sun_box_a.y * sun_box_a.y + sun_box_a.z * sun_box_a.z); + float sun_box_edge_b = sqrtf(sun_box_b.x * sun_box_b.x + sun_box_b.y * sun_box_b.y + sun_box_b.z * sun_box_b.z); + + std::cout << "print launch params: " << std::endl; + std::cout << "width : " << params.width << std::endl; + std::cout << "height : " << params.height << std::endl; + std::cout << "max_depth : " << params.max_depth << std::endl; + // std::cout << "hit_point_buffer : " << params.hit_point_buffer << std::endl; + std::cout << "hit_buffer : " << params.hit_buffer << std::endl; + std::cout << "sun_dir_buffer : " << params.sun_dir_buffer << std::endl; + std::cout << "sun_vector : " << params.sun_vector.x << " " << params.sun_vector.y << " " << params.sun_vector.z << std::endl; + // std::cout << "max_sun_angle : " << params.max_sun_angle << std::endl; + std::cout << "sun_v0 : " << params.sun_v0.x << " " << params.sun_v0.y << " " << params.sun_v0.z << std::endl; + std::cout << "sun_v1 : " << params.sun_v1.x << " " << params.sun_v1.y << " " << 
params.sun_v1.z << std::endl; + std::cout << "sun_v2 : " << params.sun_v2.x << " " << params.sun_v2.y << " " << params.sun_v2.z << std::endl; + std::cout << "sun_v3 : " << params.sun_v3.x << " " << params.sun_v3.y << " " << params.sun_v3.z << std::endl; + std::cout << "sun_box_edge_a : " << sun_box_edge_a << std::endl; + std::cout << "sun_box_edge_b : " << sun_box_edge_b << std::endl; +} + void SolTraceSystem::initialize() { @@ -144,14 +152,14 @@ void SolTraceSystem::initialize() // Set generation type switch (m_sun->get_gen_type()) { - case(SolTrace::Data::GenType::RANDOM): - data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::RANDOM; - break; - case(SolTrace::Data::GenType::HALTON): - data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::HALTON; - break; - default: - data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::UNKNOWN; + case (SolTrace::Data::GenType::RANDOM): + data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::RANDOM; + break; + case (SolTrace::Data::GenType::HALTON): + data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::HALTON; + break; + default: + data_manager->launch_params_H.sun_gen_type = OptixCSP::GenType::UNKNOWN; } // Assign sun shape parameters (if necessary) @@ -227,7 +235,7 @@ void SolTraceSystem::initialize() data_manager->allocateGeometryDataArray(geometry_manager->get_geometry_data_array()); data_manager->allocateMaterialDataArray(geometry_manager->get_material_data_array_front(), geometry_manager->get_material_data_array_back()); - + if (m_verbose) { std::cout << "Time to compute AABB: " << AABB_timer.get_time_sec() << " seconds" << std::endl; @@ -237,7 +245,6 @@ void SolTraceSystem::initialize() print_launch_params(); } - data_manager->allocateLaunchParams(); m_timer_setup.stop(); @@ -275,10 +282,10 @@ void SolTraceSystem::run() int height = data_manager->launch_params_H.height; size_t m_mem_free_after; - size_t mem_total; + size_t mem_total; cudaMemGetInfo(&m_mem_free_after, 
&mem_total); - if(m_verbose) + if (m_verbose) std::cout << "Memory used by launch: " << (m_mem_free_before - m_mem_free_after) / (1024.0 * 1024.0) << " MB\n"; // Launch the simulation. @@ -322,21 +329,21 @@ void SolTraceSystem::run() if (m_verbose) { - const double t_setup = timer_setup_buffer.get_time_sec(); - const double t_launch = timer_optix_launch.get_time_sec(); + const double t_setup = timer_setup_buffer.get_time_sec(); + const double t_launch = timer_optix_launch.get_time_sec(); const double t_collect = timer_collect_results.get_time_sec(); - const double t_total = t_setup + t_launch + t_collect; - const double inv_n = n_iterations > 0 ? 1.0 / static_cast(n_iterations) : 0.0; + const double t_total = t_setup + t_launch + t_collect; + const double inv_n = n_iterations > 0 ? 1.0 / static_cast(n_iterations) : 0.0; std::cout << "\n--- SolTraceSystem::run() timing (" << n_iterations << " iteration" << (n_iterations == 1 ? "" : "s") << ") ---\n"; std::cout << std::fixed << std::setprecision(6); - std::cout << " setup_device_buffer : total = " << t_setup << " s" - << " avg = " << t_setup * inv_n << " s" - << " fraction = " << (t_total > 0.0 ? 100.0 * t_setup / t_total : 0.0) << " %\n"; - std::cout << " optixLaunch : total = " << t_launch << " s" - << " avg = " << t_launch * inv_n << " s" - << " fraction = " << (t_total > 0.0 ? 100.0 * t_launch / t_total : 0.0) << " %\n"; + std::cout << " setup_device_buffer : total = " << t_setup << " s" + << " avg = " << t_setup * inv_n << " s" + << " fraction = " << (t_total > 0.0 ? 100.0 * t_setup / t_total : 0.0) << " %\n"; + std::cout << " optixLaunch : total = " << t_launch << " s" + << " avg = " << t_launch * inv_n << " s" + << " fraction = " << (t_total > 0.0 ? 100.0 * t_launch / t_total : 0.0) << " %\n"; std::cout << " get_buffer_results : total = " << t_collect << " s" << " avg = " << t_collect * inv_n << " s" << " fraction = " << (t_total > 0.0 ? 
100.0 * t_collect / t_total : 0.0) << " %\n"; @@ -349,18 +356,20 @@ void SolTraceSystem::update() { const int N_slots = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; - const size_t hit_point_buffer_size = N_slots * sizeof(float4); - const size_t element_id_size = N_slots * sizeof(int32_t); - const size_t hit_type_buffer_size = N_slots * sizeof(uint8_t); + // const size_t hit_point_buffer_size = N_slots * sizeof(float4); + // const size_t element_id_size = N_slots * sizeof(int32_t); + // const size_t hit_type_buffer_size = N_slots * sizeof(uint8_t); + const size_t hit_buffer_size = N_slots * sizeof(HitRecord); // update aabb and sun plane accordingly geometry_manager->update_geometry_info(m_element_list, data_manager->launch_params_H); // update data on the device data_manager->updateGeometryDataArray(geometry_manager->get_geometry_data_array()); - CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_point_buffer, 0, hit_point_buffer_size)); - CUDA_CHECK(cudaMemset(data_manager->launch_params_H.element_id_buffer, kElementIdBuffer, element_id_size)); - CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_type_buffer, HitType::HIT_UNASSIGNED, hit_type_buffer_size)); + // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_point_buffer, 0, hit_point_buffer_size)); + // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.element_id_buffer, kElementIdBuffer, element_id_size)); + // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_type_buffer, HitType::HIT_UNASSIGNED, hit_type_buffer_size)); + CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size)); data_manager->updateLaunchParams(); } @@ -409,28 +418,35 @@ void SolTraceSystem::clean_up() CUDA_CHECK(cudaFree(reinterpret_cast(m_state.d_gas_output_buffer))); // Free device-side launch parameter memory - CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_point_buffer))); - 
CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.element_id_buffer))); - CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_type_buffer))); + // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_point_buffer))); + // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.element_id_buffer))); + // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_type_buffer))); + CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_buffer))); CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.sun_dir_buffer))); - data_manager->launch_params_H.hit_point_buffer = nullptr; - data_manager->launch_params_H.element_id_buffer = nullptr; - data_manager->launch_params_H.hit_type_buffer = nullptr; + // data_manager->launch_params_H.hit_point_buffer = nullptr; + // data_manager->launch_params_H.element_id_buffer = nullptr; + // data_manager->launch_params_H.hit_type_buffer = nullptr; + data_manager->launch_params_H.hit_buffer = nullptr; data_manager->launch_params_H.sun_dir_buffer = nullptr; - m_hit_point_buffer_size_allocated = 0; - m_element_id_buffer_size_allocated = 0; - m_hit_type_buffer_size_allocated = 0; + // m_hit_point_buffer_size_allocated = 0; + // m_element_id_buffer_size_allocated = 0; + // m_hit_type_buffer_size_allocated = 0; + m_hit_buffer_size_allocated = 0; m_sun_dir_buffer_size_allocated = 0; data_manager->cleanup(); - m_hp_output_buffer_host.clear(); - m_hp_output_buffer_host.shrink_to_fit(); - m_element_id_buffer_host.clear(); - m_element_id_buffer_host.shrink_to_fit(); - m_hit_type_buffer_host.clear(); - m_hit_type_buffer_host.shrink_to_fit(); + CUDA_CHECK(cudaFreeHost(reinterpret_cast(m_hit_buffer_host))); + m_hit_buffer_host = nullptr; + m_hit_buffer_host_capacity = 0; + + // m_hp_output_buffer_host.clear(); + // m_hp_output_buffer_host.shrink_to_fit(); + // m_element_id_buffer_host.clear(); + // m_element_id_buffer_host.shrink_to_fit(); + // 
m_hit_type_buffer_host.clear(); + // m_hit_type_buffer_host.shrink_to_fit(); m_state.context = nullptr; m_state.stream = nullptr; @@ -456,9 +472,9 @@ void SolTraceSystem::reset() m_hit_type_vec.clear(); m_sunraynumber_vec.clear(); - m_hp_output_buffer_host.clear(); - m_element_id_buffer_host.clear(); - m_hit_type_buffer_host.clear(); + // m_hp_output_buffer_host.clear(); + // m_element_id_buffer_host.clear(); + // m_hit_type_buffer_host.clear(); m_sun = nullptr; m_number_of_rays = 0; @@ -615,37 +631,47 @@ void SolTraceSystem::setup_device_buffer() data_manager->launch_params_H.height = 1; data_manager->launch_params_H.max_depth = MAX_TRACE_DEPTH; - const size_t hit_point_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float4) * data_manager->launch_params_H.max_depth; - const size_t element_id_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(int32_t) * data_manager->launch_params_H.max_depth; - const size_t hit_type_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(uint8_t) * data_manager->launch_params_H.max_depth; + // const size_t hit_point_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float4) * data_manager->launch_params_H.max_depth; + // const size_t element_id_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(int32_t) * data_manager->launch_params_H.max_depth; + // const size_t hit_type_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(uint8_t) * data_manager->launch_params_H.max_depth; + const size_t hit_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth * sizeof(HitRecord); const size_t sun_dir_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float3); - if 
(data_manager->launch_params_H.hit_point_buffer == nullptr || m_hit_point_buffer_size_allocated != hit_point_buffer_size) - { - if (data_manager->launch_params_H.hit_point_buffer != nullptr) - CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_point_buffer))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_point_buffer), hit_point_buffer_size)); - m_hit_point_buffer_size_allocated = hit_point_buffer_size; - } - CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_point_buffer, 0, hit_point_buffer_size)); - - if (data_manager->launch_params_H.element_id_buffer == nullptr || m_element_id_buffer_size_allocated != element_id_size) + // if (data_manager->launch_params_H.hit_point_buffer == nullptr || m_hit_point_buffer_size_allocated != hit_point_buffer_size) + // { + // if (data_manager->launch_params_H.hit_point_buffer != nullptr) + // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_point_buffer))); + // CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_point_buffer), hit_point_buffer_size)); + // m_hit_point_buffer_size_allocated = hit_point_buffer_size; + // } + // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_point_buffer, 0, hit_point_buffer_size)); + + // if (data_manager->launch_params_H.element_id_buffer == nullptr || m_element_id_buffer_size_allocated != element_id_size) + // { + // if (data_manager->launch_params_H.element_id_buffer != nullptr) + // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.element_id_buffer))); + // CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.element_id_buffer), element_id_size)); + // m_element_id_buffer_size_allocated = element_id_size; + // } + // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.element_id_buffer, kElementIdBuffer, element_id_size)); + + // if (data_manager->launch_params_H.hit_type_buffer == nullptr || m_hit_type_buffer_size_allocated != hit_type_size) + // { + 
// if (data_manager->launch_params_H.hit_type_buffer != nullptr) + // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_type_buffer))); + // CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_type_buffer), hit_type_size)); + // m_hit_type_buffer_size_allocated = hit_type_size; + // } + // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_type_buffer, HitType::HIT_UNASSIGNED, hit_type_size)); + + if (data_manager->launch_params_H.hit_buffer == nullptr || m_hit_buffer_size_allocated != hit_buffer_size) { - if (data_manager->launch_params_H.element_id_buffer != nullptr) - CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.element_id_buffer))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.element_id_buffer), element_id_size)); - m_element_id_buffer_size_allocated = element_id_size; + if (data_manager->launch_params_H.hit_buffer != nullptr) + CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_buffer))); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_buffer), hit_buffer_size)); + m_hit_buffer_size_allocated = hit_buffer_size; } - CUDA_CHECK(cudaMemset(data_manager->launch_params_H.element_id_buffer, kElementIdBuffer, element_id_size)); - - if (data_manager->launch_params_H.hit_type_buffer == nullptr || m_hit_type_buffer_size_allocated != hit_type_size) - { - if (data_manager->launch_params_H.hit_type_buffer != nullptr) - CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_type_buffer))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_type_buffer), hit_type_size)); - m_hit_type_buffer_size_allocated = hit_type_size; - } - CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_type_buffer, HitType::HIT_UNASSIGNED, hit_type_size)); + CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size)); if (data_manager->launch_params_H.sun_dir_buffer == nullptr || 
m_sun_dir_buffer_size_allocated != sun_dir_size) { @@ -674,18 +700,26 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector { const int max_depth = data_manager->launch_params_H.max_depth; const int num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height; - const int output_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; - - if (static_cast(m_hp_output_buffer_host.size()) != output_size) - m_hp_output_buffer_host.resize(output_size); - if (static_cast(m_element_id_buffer_host.size()) != output_size) - m_element_id_buffer_host.resize(output_size); - if (static_cast(m_hit_type_buffer_host.size()) != output_size) - m_hit_type_buffer_host.resize(output_size); + // const int output_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; + const uint_fast64_t output_size = num_rays * max_depth; + + // if (static_cast(m_hp_output_buffer_host.size()) != output_size) + // m_hp_output_buffer_host.resize(output_size); + // if (static_cast(m_element_id_buffer_host.size()) != output_size) + // m_element_id_buffer_host.resize(output_size); + // if (static_cast(m_hit_type_buffer_host.size()) != output_size) + // m_hit_type_buffer_host.resize(output_size); + if (m_hit_buffer_host_capacity != output_size) + { + CUDA_CHECK(cudaFreeHost(reinterpret_cast(m_hit_buffer_host))); + CUDA_CHECK(cudaMallocHost(reinterpret_cast(&m_hit_buffer_host), output_size * sizeof(HitRecord))); + m_hit_buffer_host_capacity = output_size; + } - CUDA_CHECK(cudaMemcpy(m_hp_output_buffer_host.data(), data_manager->launch_params_H.hit_point_buffer, output_size * sizeof(float4), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(m_element_id_buffer_host.data(), data_manager->launch_params_H.element_id_buffer, output_size * sizeof(int32_t), cudaMemcpyDeviceToHost)); - 
CUDA_CHECK(cudaMemcpy(m_hit_type_buffer_host.data(), data_manager->launch_params_H.hit_type_buffer, output_size * sizeof(uint8_t), cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(m_hp_output_buffer_host.data(), data_manager->launch_params_H.hit_point_buffer, output_size * sizeof(float4), cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(m_element_id_buffer_host.data(), data_manager->launch_params_H.element_id_buffer, output_size * sizeof(int32_t), cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(m_hit_type_buffer_host.data(), data_manager->launch_params_H.hit_type_buffer, output_size * sizeof(uint8_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(m_hit_buffer_host, data_manager->launch_params_H.hit_buffer, output_size * sizeof(HitRecord), cudaMemcpyDeviceToHost)); // Loop through each buffer slot uint_fast64_t ray_number = raynumber_vec.empty() ? 0 : raynumber_vec.back(); @@ -694,7 +728,9 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector { // Get hit type - const uint8_t &hit_type = m_hit_type_buffer_host[i]; + // const uint8_t &hit_type = m_hit_type_buffer_host[i]; + const HitRecord &hr = m_hit_buffer_host[i]; + const uint8_t &hit_type = hr.hit_type; // Skip if empty if (hit_type < HitType::HIT_CREATE || hit_type > HitType::HIT_EXIT) @@ -724,11 +760,13 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector } // Get hit record, element_id - const float4 &hit_record = m_hp_output_buffer_host[i]; // [depth, pos x, pos y, pos z] - const int32_t &element_id = m_element_id_buffer_host[i]; + // const float4 &hit_record = m_hp_output_buffer_host[i]; // [depth, pos x, pos y, pos z] + // const int32_t &element_id = m_element_id_buffer_host[i]; + const float4 &hit_point = hr.hit_point; + const int32_t &element_id = hr.element_id; // Collect results - hp_vec.push_back(hit_record); + hp_vec.push_back(hit_point); raynumber_vec.push_back(ray_number); hit_type_vec.push_back(hit_type); 
element_id_vec.push_back(element_id); diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 31f4309b..7e2924b8 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -1,22 +1,21 @@ #pragma once +#include +#include +#include #include #include -#include -#include -#include - - #include "core/soltrace_state.h" // SoltraceState -#include "core/vec3d.h" // Vec3d +#include "core/vec3d.h" // Vec3d #include "core/timer.h" #include "core/CspElement.h" // CspElement #include "core/Surface.h" // Surface and derived classes #include "../../../../../simulation_data/simulation_data_export.hpp" -namespace OptixCSP { +namespace OptixCSP +{ class GeometryManager; class pipelineManager; @@ -25,16 +24,17 @@ namespace OptixCSP { class Vec3d; class Surface; + struct HitRecord; + static constexpr SolTrace::Data::SunShape kSupportedSunshapes[] = { - SolTrace::Data::SunShape::GAUSSIAN, - SolTrace::Data::SunShape::PILLBOX, - SolTrace::Data::SunShape::BUIE_CSR - }; + SolTrace::Data::SunShape::GAUSSIAN, + SolTrace::Data::SunShape::PILLBOX, + SolTrace::Data::SunShape::BUIE_CSR}; - class SolTraceSystem { + class SolTraceSystem + { public: - SolTraceSystem(); ~SolTraceSystem(); @@ -48,8 +48,8 @@ namespace OptixCSP { void update(); // Get all hit points - void get_hp_output(std::vector& hp_vec, std::vector& raynumber_vec, std::vector& element_id_vec, - std::vector& hit_type_vec); + void get_hp_output(std::vector &hp_vec, std::vector &raynumber_vec, std::vector &element_id_vec, + std::vector &hit_type_vec); /// Explicit cleanup void clean_up(); @@ -64,14 +64,14 @@ namespace OptixCSP { /// /// void set_number_of_rays(uint_fast64_t nrays, uint_fast64_t maxrays) - { + { m_number_of_rays = nrays; m_max_number_of_rays = maxrays; } - void 
set_sun(SolTrace::Data::Sun* sun) { m_sun = sun; } + void set_sun(SolTrace::Data::Sun *sun) { m_sun = sun; } - void set_seed(uint64_t seed) { m_seed = seed; } // Set sun seed + void set_seed(uint64_t seed) { m_seed = seed; } // Set sun seed void set_optical_errors(bool include_optical_errors) { @@ -94,23 +94,20 @@ namespace OptixCSP { /// double get_sun_plane_area() const; - uint_fast64_t get_N_sun_rays() - { + uint_fast64_t get_N_sun_rays() + { if (m_sunraynumber_vec.empty()) return 0; - return m_sunraynumber_vec.back(); + return m_sunraynumber_vec.back(); } std::vector get_sunraynumber_vec() const { return m_sunraynumber_vec; } void set_sun_shape_errors(bool flag) { this->m_include_sun_shape_errors = flag; } - - private: - std::shared_ptr geometry_manager; std::shared_ptr pipeline_manager; - std::shared_ptr data_manager; + std::shared_ptr data_manager; uint_fast64_t m_number_of_rays; uint_fast64_t m_max_number_of_rays; @@ -118,12 +115,11 @@ namespace OptixCSP { bool m_verbose; // Sun - //OptixCSP::Vec3d m_sun_vector; - //double m_sun_angle; - - SolTrace::Data::Sun* m_sun; - bool m_include_sun_shape_errors = false; + // OptixCSP::Vec3d m_sun_vector; + // double m_sun_angle; + SolTrace::Data::Sun *m_sun; + bool m_include_sun_shape_errors = false; uint64_t m_seed = 123456ULL; bool m_optical_errors; @@ -136,25 +132,30 @@ namespace OptixCSP { std::vector m_raynumber_vec; std::vector m_element_id_vec; std::vector m_hit_type_vec; - std::vector m_sunraynumber_vec; // This is ID of hit rays out of all generated rays + std::vector m_sunraynumber_vec; // This is ID of hit rays out of all generated rays // Reused host-side scratch buffers for copying launch results back from device. - std::vector m_hp_output_buffer_host; - std::vector m_element_id_buffer_host; - std::vector m_hit_type_buffer_host; + // Allocated with cudaMallocHost, deallocated with cudaFreeHost resulting in using + // page-locked memory for faster transfers between device and host. 
+ HitRecord *m_hit_buffer_host; + uint_fast64_t m_hit_buffer_host_capacity; + // std::vector m_hp_output_buffer_host; + // std::vector m_element_id_buffer_host; + // std::vector m_hit_type_buffer_host; // Current allocated device launch buffer sizes. - size_t m_hit_point_buffer_size_allocated = 0; - size_t m_element_id_buffer_size_allocated = 0; - size_t m_hit_type_buffer_size_allocated = 0; + // size_t m_hit_point_buffer_size_allocated = 0; + // size_t m_element_id_buffer_size_allocated = 0; + // size_t m_hit_type_buffer_size_allocated = 0; + size_t m_hit_buffer_size_allocated = 0; size_t m_sun_dir_buffer_size_allocated = 0; std::vector> m_element_list; void create_shader_binding_table(); void setup_device_buffer(); - void get_buffer_results(std::vector& hp_vec, std::vector& raynumber_vec, - std::vector& element_id_vec, std::vector& hit_type_vec, - std::vector& sunraynumber_vec); + void get_buffer_results(std::vector &hp_vec, std::vector &raynumber_vec, + std::vector &element_id_vec, std::vector &hit_type_vec, + std::vector &sunraynumber_vec); Timer m_timer_setup; Timer m_timer_trace; @@ -162,7 +163,5 @@ namespace OptixCSP { // memory usage size_t m_mem_free_before; size_t m_mem_free_after; - - }; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index 8eefec4d..ebcb14f7 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -19,6 +19,13 @@ namespace OptixCSP{ MaterialData material_data; }; + struct HitRecord { + float4 hit_point; + int32_t element_id; + uint8_t hit_type; + uint8_t _pad[3]; // TODO: Is this necessary? 
+ }; + enum RayType { RAY_TYPE_RADIANCE = 0, @@ -46,12 +53,14 @@ namespace OptixCSP{ int max_depth; unsigned int ray_offset; // Global offset for current branch - float4* hit_point_buffer; + // float4* hit_point_buffer; + HitRecord* hit_buffer; float3* sun_dir_buffer; curandState* rng_states; OptixTraversableHandle handle; - int32_t* element_id_buffer; - uint8_t* hit_type_buffer; + // int32_t* element_id_buffer; + // uint8_t* hit_type_buffer; + float3 sun_vector; bool include_sun_shape_errors; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu index 0c896c55..3eb55621 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu @@ -244,14 +244,14 @@ extern "C" __global__ void __closesthit__element() const int slot = params.max_depth * prd.ray_path_index + new_depth; // Store the hit point in the hit point buffer (used for visualization or further calculations) - params.hit_point_buffer[slot] = make_float4(new_depth, hit_point); + params.hit_buffer[slot].hit_point = make_float4(new_depth, hit_point); // Store element id const int32_t elementId = params.geometry_data_array[optixGetPrimitiveIndex()].id; - params.element_id_buffer[slot] = elementId; + params.hit_buffer[slot].element_id = elementId; // Store hit type - params.hit_type_buffer[slot] = hit_type; + params.hit_buffer[slot].hit_type = hit_type; // Store the reflected direction in its buffer (used for visualization or further calculations) /* diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu index f6252ad0..5d00d0b4 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu @@ -239,9 +239,12 @@ 
extern "C" __global__ void __raygen__sun_source() prd.depth = 0; // TODO make this a launch parameter - params.hit_point_buffer[params.max_depth * prd.ray_path_index] = make_float4(0.0f, ray_gen_pos); - params.element_id_buffer[params.max_depth * prd.ray_path_index] = OptixCSP::kElementIdRayGen; - params.hit_type_buffer[params.max_depth * prd.ray_path_index] = OptixCSP::HitType::HIT_CREATE; + // params.hit_point_buffer[params.max_depth * prd.ray_path_index] = make_float4(0.0f, ray_gen_pos); + // params.element_id_buffer[params.max_depth * prd.ray_path_index] = OptixCSP::kElementIdRayGen; + // params.hit_type_buffer[params.max_depth * prd.ray_path_index] = OptixCSP::HitType::HIT_CREATE; + params.hit_buffer[params.max_depth * prd.ray_path_index].hit_point = make_float4(0.0f, ray_gen_pos); + params.hit_buffer[params.max_depth * prd.ray_path_index].element_id = OptixCSP::kElementIdRayGen; + params.hit_buffer[params.max_depth * prd.ray_path_index].hit_type = OptixCSP::HitType::HIT_CREATE; params.sun_dir_buffer[prd.ray_path_index] = ray_dir; From 8db1a6aa6477222437680d0c184dc727e10c692c Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 12 May 2026 14:30:31 -0600 Subject: [PATCH 17/60] Update host side launch params --- .../optix_runner/OptixCSP/src/core/data_manager.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp index e16a70c3..3e93a66f 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp @@ -26,11 +26,12 @@ dataManager::dataManager() launch_params_H.max_depth = 5; launch_params_H.ray_offset = 0; - launch_params_H.hit_point_buffer = nullptr; + // launch_params_H.hit_point_buffer = nullptr; + launch_params_H.hit_buffer = nullptr; launch_params_H.sun_dir_buffer = 
nullptr; launch_params_H.rng_states = nullptr; - launch_params_H.element_id_buffer = nullptr; - launch_params_H.hit_type_buffer = nullptr; + // launch_params_H.element_id_buffer = nullptr; + // launch_params_H.hit_type_buffer = nullptr; launch_params_H.sun_vector = make_float3(0.0f, 0.0f, 10.0f); launch_params_H.sun_shape = OptixCSP::SunShape::UNKNOWN; launch_params_H.include_sun_shape_errors = false; From 75cb2b7b51fe5704fcd98872da70bb6f5c208c03 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 12 May 2026 15:42:55 -0600 Subject: [PATCH 18/60] Rework the loop in the SolTraceSystem::get_buffer_results function loop to leverage the structure better --- .../OptixCSP/src/core/soltrace_system.cpp | 162 ++++++++++++------ .../OptixCSP/src/shaders/Soltrace.h | 3 + 2 files changed, 114 insertions(+), 51 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 05c30567..c764566f 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -698,8 +698,8 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector std::vector &element_id_vec, std::vector &hit_type_vec, std::vector &sunraynumber_vec) { - const int max_depth = data_manager->launch_params_H.max_depth; - const int num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height; + const uint_fast64_t max_depth = data_manager->launch_params_H.max_depth; + const uint_fast64_t num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height; // const int output_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; const uint_fast64_t output_size = num_rays * max_depth; @@ -721,67 +721,127 @@ void 
SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector // CUDA_CHECK(cudaMemcpy(m_hit_type_buffer_host.data(), data_manager->launch_params_H.hit_type_buffer, output_size * sizeof(uint8_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(m_hit_buffer_host, data_manager->launch_params_H.hit_buffer, output_size * sizeof(HitRecord), cudaMemcpyDeviceToHost)); - // Loop through each buffer slot + // Loop through each ray uint_fast64_t ray_number = raynumber_vec.empty() ? 0 : raynumber_vec.back(); uint_fast64_t sunray_number = sunraynumber_vec.empty() ? 0 : sunraynumber_vec.back(); - for (int i = 0; i < output_size; ++i) + HitRecord pending_record; + bool pending_create = false; + for (uint_fast64_t ray = 0; ray < num_rays; ++ray) { - - // Get hit type - // const uint8_t &hit_type = m_hit_type_buffer_host[i]; - const HitRecord &hr = m_hit_buffer_host[i]; - const uint8_t &hit_type = hr.hit_type; - - // Skip if empty - if (hit_type < HitType::HIT_CREATE || hit_type > HitType::HIT_EXIT) + // bool ray_hit_something = false; + for (uint_fast64_t depth = 0; depth < max_depth; ++depth) { - continue; - } + uint_fast64_t idx = max_depth * ray + depth; + const HitRecord &hr = m_hit_buffer_host[idx]; + const uint8_t &hit_type = hr.hit_type; - // If new ray, check if previous ray hit anything - if (hit_type == HitType::HIT_CREATE) - { - // Remove last ray if it has no hits - if (!hit_type_vec.empty() && hit_type_vec.back() == HitType::HIT_CREATE) + if (hit_type < HitType::HIT_CREATE || hit_type > HitType::HIT_EXIT) { - hp_vec.pop_back(); - raynumber_vec.pop_back(); - hit_type_vec.pop_back(); - element_id_vec.pop_back(); - sunraynumber_vec.pop_back(); - ray_number--; + // Hit end of ray history--go to next ray + break; } - // New ray - ray_number++; - - // Sun ray number always increments, even if no hit - sunray_number++; + if (hit_type == HitType::HIT_CREATE) + { + // New ray -- capture data and wait to see if the ray hit anything + pending_record = hr; + pending_create 
= true; + sunray_number++; + } + else + { + // Any invalid hit types have already been handled by the first if block + + // Clear the pending data if necessary + if (pending_create) + { + pending_create = false; + ray_number++; + hp_vec.push_back(pending_record.hit_point); + raynumber_vec.push_back(ray_number); + hit_type_vec.push_back(pending_record.hit_type); + element_id_vec.push_back(pending_record.element_id); + sunraynumber_vec.push_back(sunray_number); + } + + hp_vec.push_back(hr.hit_point); + raynumber_vec.push_back(ray_number); + hit_type_vec.push_back(hit_type); + element_id_vec.push_back(hr.element_id); + sunraynumber_vec.push_back(sunray_number); + + if (hit_type == HitType::HIT_ABSORB || hit_type == HitType::HIT_EXIT) + { + // Ray has terminated. Go to the next. + // NOTE: As of this writing, OptixRunner does not mark rays + // with HIT_EXIT. Include it here anyway. + break; + } + } } + } - // Get hit record, element_id - // const float4 &hit_record = m_hp_output_buffer_host[i]; // [depth, pos x, pos y, pos z] - // const int32_t &element_id = m_element_id_buffer_host[i]; - const float4 &hit_point = hr.hit_point; - const int32_t &element_id = hr.element_id; + // // Loop through each buffer slot + // uint_fast64_t ray_number = raynumber_vec.empty() ? 0 : raynumber_vec.back(); + // uint_fast64_t sunray_number = sunraynumber_vec.empty() ? 
0 : sunraynumber_vec.back(); + // for (int i = 0; i < output_size; ++i) + // { - // Collect results - hp_vec.push_back(hit_point); - raynumber_vec.push_back(ray_number); - hit_type_vec.push_back(hit_type); - element_id_vec.push_back(element_id); - sunraynumber_vec.push_back(sunray_number); - } + // // Get hit type + // // const uint8_t &hit_type = m_hit_type_buffer_host[i]; + // const HitRecord &hr = m_hit_buffer_host[i]; + // const uint8_t &hit_type = hr.hit_type; + + // // Skip if empty + // if (hit_type < HitType::HIT_CREATE || hit_type > HitType::HIT_EXIT) + // { + // continue; + // } + + // // If new ray, check if previous ray hit anything + // if (hit_type == HitType::HIT_CREATE) + // { + // // Remove last ray if it has no hits + // if (!hit_type_vec.empty() && hit_type_vec.back() == HitType::HIT_CREATE) + // { + // hp_vec.pop_back(); + // raynumber_vec.pop_back(); + // hit_type_vec.pop_back(); + // element_id_vec.pop_back(); + // sunraynumber_vec.pop_back(); + // ray_number--; + // } + + // // New ray + // ray_number++; + + // // Sun ray number always increments, even if no hit + // sunray_number++; + // } + + // // Get hit record, element_id + // // const float4 &hit_record = m_hp_output_buffer_host[i]; // [depth, pos x, pos y, pos z] + // // const int32_t &element_id = m_element_id_buffer_host[i]; + // const float4 &hit_point = hr.hit_point; + // const int32_t &element_id = hr.element_id; + + // // Collect results + // hp_vec.push_back(hit_point); + // raynumber_vec.push_back(ray_number); + // hit_type_vec.push_back(hit_type); + // element_id_vec.push_back(element_id); + // sunraynumber_vec.push_back(sunray_number); + // } - // Remove last ray if it is only CREATE - if (!hit_type_vec.empty() && hit_type_vec.back() == HitType::HIT_CREATE) - { - hp_vec.pop_back(); - raynumber_vec.pop_back(); - element_id_vec.pop_back(); - hit_type_vec.pop_back(); - sunraynumber_vec.pop_back(); - } + // // Remove last ray if it is only CREATE + // if (!hit_type_vec.empty() && 
hit_type_vec.back() == HitType::HIT_CREATE) + // { + // hp_vec.pop_back(); + // raynumber_vec.pop_back(); + // element_id_vec.pop_back(); + // hit_type_vec.pop_back(); + // sunraynumber_vec.pop_back(); + // } return; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index ebcb14f7..7457ea56 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -12,6 +12,9 @@ namespace OptixCSP{ const unsigned int NUM_ATTRIBUTE_VALUES = 4u; const unsigned int NUM_PAYLOAD_VALUES = 2u; + // NOTE: Maximum number of ray interactions in tracing with the geometry is + // MAX_TRACE_DEPTH - 1 (so currently 4). See the end of the function + // __closesthit__element in materials.cu. const unsigned int MAX_TRACE_DEPTH = 5u; struct HitGroupData From 7ebc47988d4bf6061f075c91b2642f1b11a386cc Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 12 May 2026 16:19:55 -0600 Subject: [PATCH 19/60] Add nsight profiling markers on CPU side --- .../optix_runner/OptixCSP/src/CMakeLists.txt | 9 +++++ .../OptixCSP/src/core/soltrace_system.cpp | 33 ++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/CMakeLists.txt b/coretrace/simulation_runner/optix_runner/OptixCSP/src/CMakeLists.txt index 1b9591b7..5fe3c812 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/CMakeLists.txt +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/CMakeLists.txt @@ -64,6 +64,15 @@ target_link_libraries(OptixCSP_core PRIVATE CUDA::cuda_driver CUDA::cudart) +# NVTX3 profiling support (header-only, ships with CUDA Toolkit >= 10.0) +if(TARGET CUDA::nvtx3) + target_link_libraries(OptixCSP_core PRIVATE CUDA::nvtx3) + target_compile_definitions(OptixCSP_core PRIVATE NVTX_ENABLED) + message(STATUS "NVTX3 
profiling enabled for OptixCSP_core") +else() + message(STATUS "CUDA::nvtx3 not found; NVTX profiling annotations disabled") +endif() + # have to specify the Optix_INCLUDE directory for the CUDA compiler target_compile_options(OptixCSP_core PRIVATE $<$:--use_fast_math -lineinfo -I"${OptiX_INCLUDE}"> diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index c764566f..78cc9a62 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -22,6 +22,10 @@ #include #include +#ifdef NVTX_ENABLED +#include +#endif + #include #include @@ -36,7 +40,7 @@ typedef Record HitGroupRecord; SolTraceSystem::SolTraceSystem() : m_number_of_rays(0), m_max_number_of_rays(0), - m_verbose(true), + m_verbose(false), m_mem_free_before(0), m_mem_free_after(0), m_optical_errors(false), @@ -252,6 +256,9 @@ void SolTraceSystem::initialize() void SolTraceSystem::run() { +#ifdef NVTX_ENABLED + NVTX3_FUNC_RANGE(); +#endif // Initialize results vectors m_hp_vec.clear(); @@ -275,7 +282,12 @@ void SolTraceSystem::run() // Allocate buffer (sets data_manager->launch_params_H buffer) timer_setup_buffer.start(); - setup_device_buffer(); + { +#ifdef NVTX_ENABLED + nvtx3::scoped_range nvtx_setup{"setup_device_buffer"}; +#endif + setup_device_buffer(); + } timer_setup_buffer.stop(); int width = data_manager->launch_params_H.width; @@ -290,6 +302,10 @@ void SolTraceSystem::run() // Launch the simulation. timer_optix_launch.start(); + { +#ifdef NVTX_ENABLED + nvtx3::scoped_range nvtx_launch{"optixLaunch"}; +#endif OPTIX_CHECK(optixLaunch( m_state.pipeline, m_state.stream, // Assume this stream is properly created. 
@@ -300,12 +316,18 @@ void SolTraceSystem::run() height, 1)); CUDA_SYNC_CHECK(); + } // nvtx_launch timer_optix_launch.stop(); // Collect results timer_collect_results.start(); - get_buffer_results(m_hp_vec, m_raynumber_vec, m_element_id_vec, m_hit_type_vec, - m_sunraynumber_vec); + { +#ifdef NVTX_ENABLED + nvtx3::scoped_range nvtx_collect{"get_buffer_results"}; +#endif + get_buffer_results(m_hp_vec, m_raynumber_vec, m_element_id_vec, m_hit_type_vec, + m_sunraynumber_vec); + } timer_collect_results.stop(); N_ray_hit = m_raynumber_vec.empty() ? 0 : m_raynumber_vec.back(); @@ -698,6 +720,9 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector std::vector &element_id_vec, std::vector &hit_type_vec, std::vector &sunraynumber_vec) { +#ifdef NVTX_ENABLED + NVTX3_FUNC_RANGE(); +#endif const uint_fast64_t max_depth = data_manager->launch_params_H.max_depth; const uint_fast64_t num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height; // const int output_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; From 2f4738ada9f9057342e286bf0ee7dcd81ebedd68 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 13 May 2026 13:26:14 -0600 Subject: [PATCH 20/60] Add verbose flag to simdriver --- coretrace/simdriver/main.cpp | 9 ++++++++ .../optix_runner/optix_runner.cpp | 22 +++++++++---------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/coretrace/simdriver/main.cpp b/coretrace/simdriver/main.cpp index 14ff41e4..29fb2890 100644 --- a/coretrace/simdriver/main.cpp +++ b/coretrace/simdriver/main.cpp @@ -19,6 +19,7 @@ * --optix Use the OptiX runner (only available if built with * SOLTRACE_BUILD_OPTIX_SUPPORT=ON; falls back to native * runner with a warning if OptiX support is absent) + * --verbose Enable verbose logging in the OptiX runner */ #include @@ -64,6 +65,7 @@ static void print_usage(const char *prog) << " --optix Use OptiX runner 
instead of the native runner\n" << " (requires SOLTRACE_BUILD_OPTIX_SUPPORT=ON at build time)\n" #endif + << " --verbose Enable verbose logging in the OptiX runner\n" ; } @@ -101,6 +103,7 @@ int main(int argc, char *argv[]) long long num_rays_override = -1; // -1 means use what the JSON specifies bool use_embree = false; bool use_optix = false; + bool verbose = false; // Start parsing options from argv[2] if skip_output, else from argv[3] const int opts_start = skip_output ? 2 : 3; @@ -163,6 +166,10 @@ int main(int argc, char *argv[]) { use_optix = true; } + else if (arg == "--verbose") + { + verbose = true; + } else { std::cerr << "Error: unknown option '" << arg << "'\n"; @@ -313,6 +320,8 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } + runner.get_optix_system()->set_verbose(verbose); + std::cout << "Using OptiX runner\n"; std::cout << "Setting up simulation...\n"; diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 552d3a24..ede2aff8 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -51,8 +51,6 @@ RunnerStatus OptixRunner::setup_simulation(const SimulationData *data) m_sys.initialize(); - - // std::cout << "Number of stages: " << this->tsys.StageList.size() // << std::endl; @@ -124,10 +122,10 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) auto optix_el = std::make_shared(); auto origin = el->get_origin_global(); - auto ap = el->get_aim_vector_global(); + auto ap = el->get_aim_vector_global(); OptixCSP::Vec3d origin_vec(origin.x, origin.y, origin.z); optix_el->set_origin(ToVec3d(origin)); - optix_el->set_aim_point(ToVec3d(ap)); + optix_el->set_aim_point(ToVec3d(ap)); optix_el->set_rotation_matrix(ToMatrix33d(el->get_local_to_global())); // Safely narrow element id to int32_t @@ -230,7 +228,7 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData 
*data) } auto soltrace_aperture_type = el->get_aperture()->get_type(); - + switch (soltrace_aperture_type) { case ApertureType::RECTANGLE: @@ -239,11 +237,11 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) assert(el_aperture != nullptr); // TODO: account for x and y coord? // auto aperture = std::make_shared(el_aperture->x_length(), - // el_aperture->y_length()); - auto aperture = std::make_shared(el_aperture->x_length(), - el_aperture->y_length(), - el_aperture->x_coord(), - el_aperture->y_coord()); + // el_aperture->y_length()); + auto aperture = std::make_shared(el_aperture->x_length(), + el_aperture->y_length(), + el_aperture->x_coord(), + el_aperture->y_coord()); optix_el->set_aperture(aperture); break; } @@ -423,7 +421,7 @@ RunnerStatus OptixRunner::report_simulation(SimulationResult *result, // Collect results for record raynum = raynumber_vec[ii]; glm::dvec3 pos(hp_vec[ii].y, hp_vec[ii].z, hp_vec[ii].w); // x is depth - glm::dvec3 cos(0.0); // TODO: calculate directions + glm::dvec3 cos(0.0); // TODO: calculate directions int32_t element_id = element_id_vec[ii]; uint8_t hit_type = hit_type_vec[ii]; SolTrace::Result::RayEvent rev = hit_type_to_ray_event(static_cast(hit_type)); @@ -470,7 +468,7 @@ OptixCSP::Vec3d OptixRunner::ToVec3d(glm::dvec3 v) return vec; } -OptixCSP::Matrix33d OptixRunner::ToMatrix33d(const glm::dmat3& mat) +OptixCSP::Matrix33d OptixRunner::ToMatrix33d(const glm::dmat3 &mat) { return OptixCSP::Matrix33d( mat[0][0], mat[1][0], mat[2][0], From 7802ff9d4391ff838a2a2d8e1cdc8640b0da31dc Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 13 May 2026 13:33:56 -0600 Subject: [PATCH 21/60] Better method to set verbose flag --- coretrace/simdriver/main.cpp | 2 +- coretrace/simulation_runner/optix_runner/optix_runner.hpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/coretrace/simdriver/main.cpp b/coretrace/simdriver/main.cpp index 29fb2890..01318278 100644 --- a/coretrace/simdriver/main.cpp +++ 
b/coretrace/simdriver/main.cpp @@ -320,7 +320,7 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } - runner.get_optix_system()->set_verbose(verbose); + runner.set_verbose(verbose); std::cout << "Using OptiX runner\n"; diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 7f4054dc..5124eff7 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -34,6 +34,8 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner uint_fast64_t get_N_sun_rays() { return m_sys.get_N_sun_rays(); } + void set_verbose(bool verbose) { m_sys.set_verbose(verbose); } + // Runner options // void disable_sun_shape_errors() { this->include_sun_shape_errors = false; } // void enable_sun_shape_errors() { this->include_sun_shape_errors = true; } From 6ce9bb77633c77c7d67017a7584b0c1f5b21614f Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 13 May 2026 13:37:06 -0600 Subject: [PATCH 22/60] Fix linking error --- coretrace/simulation_runner/optix_runner/optix_runner.cpp | 5 +++++ coretrace/simulation_runner/optix_runner/optix_runner.hpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index ede2aff8..22fe19e5 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -20,6 +20,11 @@ OptixRunner::~OptixRunner() this->m_sys.clean_up(); } +void OptixRunner::set_verbose(bool verbose) +{ + m_sys.set_verbose(verbose); +} + RunnerStatus OptixRunner::initialize() { // add elements to sys using data structure from SimulationData diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 5124eff7..4fd7f8eb 100644 --- 
a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -34,7 +34,7 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner uint_fast64_t get_N_sun_rays() { return m_sys.get_N_sun_rays(); } - void set_verbose(bool verbose) { m_sys.set_verbose(verbose); } + void set_verbose(bool verbose); // Runner options // void disable_sun_shape_errors() { this->include_sun_shape_errors = false; } From cda9a8988ab99337171e0caf777e0894237a2ab9 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 13 May 2026 14:10:06 -0600 Subject: [PATCH 23/60] Add more timing information to the OptixRunner --- coretrace/simdriver/main.cpp | 5 + .../OptixCSP/src/core/soltrace_system.cpp | 141 ++++++++++++++---- .../OptixCSP/src/core/soltrace_system.h | 19 +++ .../optix_runner/optix_runner.cpp | 5 + .../optix_runner/optix_runner.hpp | 2 + 5 files changed, 139 insertions(+), 33 deletions(-) diff --git a/coretrace/simdriver/main.cpp b/coretrace/simdriver/main.cpp index 01318278..11d02cbb 100644 --- a/coretrace/simdriver/main.cpp +++ b/coretrace/simdriver/main.cpp @@ -369,6 +369,11 @@ int main(int argc, char *argv[]) { std::cout << "Skipping result retrieval (--no-output).\n"; } + + if (!verbose) + { + runner.print_timing(); + } } else #endif diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 78cc9a62..798b1a92 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -49,6 +49,16 @@ SolTraceSystem::SolTraceSystem() m_include_sun_shape_errors(false), m_timer_setup(), m_timer_trace(), + m_timer_aabb(), + m_timer_geometry(), + m_timer_pipeline(), + m_timer_sbt(), + m_timer_setup_buffer(), + m_timer_optix_launch(), + m_timer_collect_results(), + m_timer_memcpy(), + 
m_timer_host_processing(), + m_n_run_iterations(0), geometry_manager(std::make_shared(m_state, m_verbose)), data_manager(std::make_shared()), pipeline_manager(std::make_shared(m_state)), @@ -206,26 +216,26 @@ void SolTraceSystem::initialize() data_manager->launch_params_H.sun_max_intensity = static_cast(m_sun->get_max_intensity()); } - Timer AABB_timer; - AABB_timer.start(); + m_timer_aabb.reset(); + m_timer_aabb.start(); geometry_manager->collect_geometry_info(m_element_list, data_manager->launch_params_H); - AABB_timer.stop(); + m_timer_aabb.stop(); - Timer geometry_timer; - geometry_timer.start(); + m_timer_geometry.reset(); + m_timer_geometry.start(); geometry_manager->create_geometries(data_manager->launch_params_H); - geometry_timer.stop(); + m_timer_geometry.stop(); // Pipeline setup. - Timer pipeline_timer; - pipeline_timer.start(); + m_timer_pipeline.reset(); + m_timer_pipeline.start(); pipeline_manager->createPipeline(); - pipeline_timer.stop(); + m_timer_pipeline.stop(); - Timer sbt_timer; - sbt_timer.start(); + m_timer_sbt.reset(); + m_timer_sbt.start(); create_shader_binding_table(); - sbt_timer.stop(); + m_timer_sbt.stop(); // seed for randomization data_manager->launch_params_H.sun_dir_seed = m_seed; @@ -242,10 +252,10 @@ void SolTraceSystem::initialize() if (m_verbose) { - std::cout << "Time to compute AABB: " << AABB_timer.get_time_sec() << " seconds" << std::endl; - std::cout << "Time to create geometries: " << geometry_timer.get_time_sec() << " seconds" << std::endl; - std::cout << "Time to create pipeline: " << pipeline_timer.get_time_sec() << " seconds" << std::endl; - std::cout << "Time to create SBT: " << sbt_timer.get_time_sec() << " seconds" << std::endl; + std::cout << "Time to compute AABB: " << m_timer_aabb.get_time_sec() << " seconds" << std::endl; + std::cout << "Time to create geometries: " << m_timer_geometry.get_time_sec() << " seconds" << std::endl; + std::cout << "Time to create pipeline: " << m_timer_pipeline.get_time_sec() << " 
seconds" << std::endl; + std::cout << "Time to create SBT: " << m_timer_sbt.get_time_sec() << " seconds" << std::endl; print_launch_params(); } @@ -268,27 +278,31 @@ void SolTraceSystem::run() uint_fast64_t N_ray_hit = 0; uint_fast64_t N_ray_gen = 0; - Timer timer_setup_buffer; - Timer timer_optix_launch; - Timer timer_collect_results; - uint64_t n_iterations = 0; + m_timer_trace.reset(); + m_timer_trace.start(); + m_timer_setup_buffer.reset(); + m_timer_optix_launch.reset(); + m_timer_collect_results.reset(); + m_timer_memcpy.reset(); + m_timer_host_processing.reset(); + m_n_run_iterations = 0; while (N_ray_hit < m_number_of_rays && N_ray_gen < m_max_number_of_rays) { - ++n_iterations; + ++m_n_run_iterations; // Update ray offset (pushed to device in setup_device_buffer) data_manager->launch_params_H.ray_offset = N_ray_gen; // Allocate buffer (sets data_manager->launch_params_H buffer) - timer_setup_buffer.start(); + m_timer_setup_buffer.start(); { #ifdef NVTX_ENABLED nvtx3::scoped_range nvtx_setup{"setup_device_buffer"}; #endif setup_device_buffer(); } - timer_setup_buffer.stop(); + m_timer_setup_buffer.stop(); int width = data_manager->launch_params_H.width; int height = data_manager->launch_params_H.height; @@ -301,7 +315,7 @@ void SolTraceSystem::run() std::cout << "Memory used by launch: " << (m_mem_free_before - m_mem_free_after) / (1024.0 * 1024.0) << " MB\n"; // Launch the simulation. 
- timer_optix_launch.start(); + m_timer_optix_launch.start(); { #ifdef NVTX_ENABLED nvtx3::scoped_range nvtx_launch{"optixLaunch"}; @@ -317,10 +331,10 @@ void SolTraceSystem::run() 1)); CUDA_SYNC_CHECK(); } // nvtx_launch - timer_optix_launch.stop(); + m_timer_optix_launch.stop(); // Collect results - timer_collect_results.start(); + m_timer_collect_results.start(); { #ifdef NVTX_ENABLED nvtx3::scoped_range nvtx_collect{"get_buffer_results"}; @@ -328,7 +342,7 @@ void SolTraceSystem::run() get_buffer_results(m_hp_vec, m_raynumber_vec, m_element_id_vec, m_hit_type_vec, m_sunraynumber_vec); } - timer_collect_results.stop(); + m_timer_collect_results.stop(); N_ray_hit = m_raynumber_vec.empty() ? 0 : m_raynumber_vec.back(); N_ray_gen += width; @@ -351,14 +365,14 @@ void SolTraceSystem::run() if (m_verbose) { - const double t_setup = timer_setup_buffer.get_time_sec(); - const double t_launch = timer_optix_launch.get_time_sec(); - const double t_collect = timer_collect_results.get_time_sec(); + const double t_setup = m_timer_setup_buffer.get_time_sec(); + const double t_launch = m_timer_optix_launch.get_time_sec(); + const double t_collect = m_timer_collect_results.get_time_sec(); const double t_total = t_setup + t_launch + t_collect; - const double inv_n = n_iterations > 0 ? 1.0 / static_cast(n_iterations) : 0.0; + const double inv_n = m_n_run_iterations > 0 ? 1.0 / static_cast(m_n_run_iterations) : 0.0; - std::cout << "\n--- SolTraceSystem::run() timing (" << n_iterations << " iteration" - << (n_iterations == 1 ? "" : "s") << ") ---\n"; + std::cout << "\n--- SolTraceSystem::run() timing (" << m_n_run_iterations << " iteration" + << (m_n_run_iterations == 1 ? 
"" : "s") << ") ---\n"; std::cout << std::fixed << std::setprecision(6); std::cout << " setup_device_buffer : total = " << t_setup << " s" << " avg = " << t_setup * inv_n << " s" @@ -744,13 +758,16 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector // CUDA_CHECK(cudaMemcpy(m_hp_output_buffer_host.data(), data_manager->launch_params_H.hit_point_buffer, output_size * sizeof(float4), cudaMemcpyDeviceToHost)); // CUDA_CHECK(cudaMemcpy(m_element_id_buffer_host.data(), data_manager->launch_params_H.element_id_buffer, output_size * sizeof(int32_t), cudaMemcpyDeviceToHost)); // CUDA_CHECK(cudaMemcpy(m_hit_type_buffer_host.data(), data_manager->launch_params_H.hit_type_buffer, output_size * sizeof(uint8_t), cudaMemcpyDeviceToHost)); + m_timer_memcpy.start(); CUDA_CHECK(cudaMemcpy(m_hit_buffer_host, data_manager->launch_params_H.hit_buffer, output_size * sizeof(HitRecord), cudaMemcpyDeviceToHost)); + m_timer_memcpy.stop(); // Loop through each ray uint_fast64_t ray_number = raynumber_vec.empty() ? 0 : raynumber_vec.back(); uint_fast64_t sunray_number = sunraynumber_vec.empty() ? 0 : sunraynumber_vec.back(); HitRecord pending_record; bool pending_create = false; + m_timer_host_processing.start(); for (uint_fast64_t ray = 0; ray < num_rays; ++ray) { // bool ray_hit_something = false; @@ -805,6 +822,7 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector } } } + m_timer_host_processing.stop(); // // Loop through each buffer slot // uint_fast64_t ray_number = raynumber_vec.empty() ? 
0 : raynumber_vec.back(); @@ -886,6 +904,63 @@ double SolTraceSystem::get_time_setup() return m_timer_setup.get_time_sec(); } +void SolTraceSystem::print_timing() const +{ + const double t_setup = m_timer_setup.get_time_sec(); + const double t_aabb = m_timer_aabb.get_time_sec(); + const double t_geometry = m_timer_geometry.get_time_sec(); + const double t_pipeline = m_timer_pipeline.get_time_sec(); + const double t_sbt = m_timer_sbt.get_time_sec(); + + const double t_trace = m_timer_trace.get_time_sec(); + const double t_buf_setup = m_timer_setup_buffer.get_time_sec(); + const double t_launch = m_timer_optix_launch.get_time_sec(); + const double t_collect = m_timer_collect_results.get_time_sec(); + const double t_memcpy = m_timer_memcpy.get_time_sec(); + const double t_host_proc = m_timer_host_processing.get_time_sec(); + + const double inv_n = m_n_run_iterations > 0 + ? 1.0 / static_cast(m_n_run_iterations) + : 0.0; + + const auto pct = [](double num, double denom) -> double { + return denom > 0.0 ? 100.0 * num / denom : 0.0; + }; + + std::cout << std::fixed << std::setprecision(6); + std::cout << "\n=== SolTraceSystem Timing Summary ===\n"; + + std::cout << "\n--- initialize() ---\n"; + std::cout << " AABB computation : " << t_aabb << " s (" << pct(t_aabb, t_setup) << " %)\n"; + std::cout << " Geometry creation : " << t_geometry << " s (" << pct(t_geometry, t_setup) << " %)\n"; + std::cout << " Pipeline creation : " << t_pipeline << " s (" << pct(t_pipeline, t_setup) << " %)\n"; + std::cout << " SBT creation : " << t_sbt << " s (" << pct(t_sbt, t_setup) << " %)\n"; + std::cout << " Total setup : " << t_setup << " s\n"; + + std::cout << "\n--- run() [" << m_n_run_iterations + << " iteration" << (m_n_run_iterations == 1 ? 
"" : "s") << "] ---\n"; + std::cout << " Setup device buffer : total = " << t_buf_setup << " s" + << " avg/iter = " << t_buf_setup * inv_n << " s" + << " (" << pct(t_buf_setup, t_trace) << " %)\n"; + std::cout << " OptiX launch : total = " << t_launch << " s" + << " avg/iter = " << t_launch * inv_n << " s" + << " (" << pct(t_launch, t_trace) << " %)\n"; + std::cout << " Collect results : total = " << t_collect << " s" + << " avg/iter = " << t_collect * inv_n << " s" + << " (" << pct(t_collect, t_trace) << " %)\n"; + std::cout << " memcpy D->H : total = " << t_memcpy << " s" + << " avg/iter = " << t_memcpy * inv_n << " s" + << " (" << pct(t_memcpy, t_collect) << " % of collect)\n"; + std::cout << " host processing : total = " << t_host_proc << " s" + << " avg/iter = " << t_host_proc * inv_n << " s" + << " (" << pct(t_host_proc, t_collect) << " % of collect)\n"; + std::cout << " Total trace : " << t_trace << " s\n"; + + std::cout << "\n--- Grand Total ---\n"; + std::cout << " Setup + Trace : " << (t_setup + t_trace) << " s\n"; + std::cout << "=====================================\n"; +} + double SolTraceSystem::get_sun_plane_area() const { const LaunchParams &lp = data_manager->launch_params_H; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 7e2924b8..b0a89155 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -86,6 +86,10 @@ namespace OptixCSP double get_time_trace(); double get_time_setup(); + /// Print a formatted summary of all timing information collected during + /// the last initialize() and run() calls. 
+ void print_timing() const; + void print_launch_params(); /// @@ -160,6 +164,21 @@ namespace OptixCSP Timer m_timer_setup; Timer m_timer_trace; + // initialize() sub-timers + Timer m_timer_aabb; + Timer m_timer_geometry; + Timer m_timer_pipeline; + Timer m_timer_sbt; + + // run() sub-timers + Timer m_timer_setup_buffer; + Timer m_timer_optix_launch; + Timer m_timer_collect_results; + // get_buffer_results() sub-timers + Timer m_timer_memcpy; + Timer m_timer_host_processing; + uint64_t m_n_run_iterations; + // memory usage size_t m_mem_free_before; size_t m_mem_free_after; diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 22fe19e5..700321c2 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -25,6 +25,11 @@ void OptixRunner::set_verbose(bool verbose) m_sys.set_verbose(verbose); } +void OptixRunner::print_timing() const +{ + m_sys.print_timing(); +} + RunnerStatus OptixRunner::initialize() { // add elements to sys using data structure from SimulationData diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 4fd7f8eb..6b3e772e 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -34,6 +34,8 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner uint_fast64_t get_N_sun_rays() { return m_sys.get_N_sun_rays(); } + void print_timing() const; + void set_verbose(bool verbose); // Runner options From 8a3946ef77180cbb710f326b20725c08b3b50229 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 13 May 2026 14:43:18 -0600 Subject: [PATCH 24/60] Initialize curand states only once per call to OptixRunner::run_simulation --- .../OptixCSP/src/core/soltrace_system.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 
deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 798b1a92..7935b611 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -287,6 +287,17 @@ void SolTraceSystem::run() m_timer_host_processing.reset(); m_n_run_iterations = 0; + // Initialize RNG states once for the entire simulation run. + // curand states are persistent on the device and advance naturally across kernel launches. + { + const unsigned int num_rng_states = static_cast(m_number_of_rays); + data_manager->ensureCurandStates( + num_rng_states, + data_manager->launch_params_H.sun_dir_seed, + 0, + m_state.stream); + } + while (N_ray_hit < m_number_of_rays && N_ray_gen < m_max_number_of_rays) { ++m_n_run_iterations; @@ -718,13 +729,6 @@ void SolTraceSystem::setup_device_buffer() } CUDA_CHECK(cudaMemset(data_manager->launch_params_H.sun_dir_buffer, 0, sun_dir_size)); - const unsigned int num_rng_states = static_cast(data_manager->launch_params_H.width * data_manager->launch_params_H.height); - data_manager->ensureCurandStates( - num_rng_states, - data_manager->launch_params_H.sun_dir_seed, - data_manager->launch_params_H.ray_offset, - m_state.stream); - data_manager->updateLaunchParams(); } From 2e3ae6bee6d5d65cb496674659bd7a7b1794d389 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 13 May 2026 15:24:27 -0600 Subject: [PATCH 25/60] Rearrange memory allocation so it is done once at the beginning of the SolTraceSystem::run --- .../OptixCSP/src/core/soltrace_system.cpp | 223 +++--------------- .../OptixCSP/src/core/soltrace_system.h | 2 +- 2 files changed, 29 insertions(+), 196 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp 
b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 7935b611..b167ac32 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -45,7 +45,6 @@ SolTraceSystem::SolTraceSystem() m_mem_free_after(0), m_optical_errors(false), m_hit_buffer_host(nullptr), - m_hit_buffer_host_capacity(0), m_include_sun_shape_errors(false), m_timer_setup(), m_timer_trace(), @@ -287,16 +286,8 @@ void SolTraceSystem::run() m_timer_host_processing.reset(); m_n_run_iterations = 0; - // Initialize RNG states once for the entire simulation run. - // curand states are persistent on the device and advance naturally across kernel launches. - { - const unsigned int num_rng_states = static_cast(m_number_of_rays); - data_manager->ensureCurandStates( - num_rng_states, - data_manager->launch_params_H.sun_dir_seed, - 0, - m_state.stream); - } + // Allocate device buffers and initialize RNG states once (sizes are constant across the while loop). 
+ allocate_device_buffers(); while (N_ray_hit < m_number_of_rays && N_ray_gen < m_max_number_of_rays) { @@ -465,20 +456,11 @@ void SolTraceSystem::clean_up() CUDA_CHECK(cudaFree(reinterpret_cast(m_state.d_gas_output_buffer))); // Free device-side launch parameter memory - // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_point_buffer))); - // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.element_id_buffer))); - // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_type_buffer))); CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_buffer))); CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.sun_dir_buffer))); - // data_manager->launch_params_H.hit_point_buffer = nullptr; - // data_manager->launch_params_H.element_id_buffer = nullptr; - // data_manager->launch_params_H.hit_type_buffer = nullptr; data_manager->launch_params_H.hit_buffer = nullptr; data_manager->launch_params_H.sun_dir_buffer = nullptr; - // m_hit_point_buffer_size_allocated = 0; - // m_element_id_buffer_size_allocated = 0; - // m_hit_type_buffer_size_allocated = 0; m_hit_buffer_size_allocated = 0; m_sun_dir_buffer_size_allocated = 0; @@ -486,14 +468,6 @@ void SolTraceSystem::clean_up() CUDA_CHECK(cudaFreeHost(reinterpret_cast(m_hit_buffer_host))); m_hit_buffer_host = nullptr; - m_hit_buffer_host_capacity = 0; - - // m_hp_output_buffer_host.clear(); - // m_hp_output_buffer_host.shrink_to_fit(); - // m_element_id_buffer_host.clear(); - // m_element_id_buffer_host.shrink_to_fit(); - // m_hit_type_buffer_host.clear(); - // m_hit_type_buffer_host.shrink_to_fit(); m_state.context = nullptr; m_state.stream = nullptr; @@ -519,10 +493,6 @@ void SolTraceSystem::reset() m_hit_type_vec.clear(); m_sunraynumber_vec.clear(); - // m_hp_output_buffer_host.clear(); - // m_element_id_buffer_host.clear(); - // m_hit_type_buffer_host.clear(); - m_sun = nullptr; m_number_of_rays = 0; m_max_number_of_rays = 0; @@ -598,55 
+568,6 @@ void SolTraceSystem::create_shader_binding_table() // initialize program handle and data OptixProgramGroup program_group_handle = pipeline_manager->getElementProgram(my_type); hitgroup_records_list[i].data.material_data = {0.875425, 0, 0, 0}; - // OptixProgramGroup program_group_handle = nullptr; - // SurfaceApertureMap map = {}; - - // switch (my_type) - // { - // case OptixCSP::OpticalEntityType::RECTANGLE_FLAT: - // map = {SurfaceType::FLAT, ApertureType::RECTANGLE}; - // program_group_handle = pipeline_manager->getElementProgram(map); - // hitgroup_records_list[i].data.material_data = {0.875425, 0, 0, 0}; - // printf("RECTANGLE_FLAT, program group address: %p \n", program_group_handle); - - // break; - - // case OptixCSP::OpticalEntityType::RECTANGLE_PARABOLIC: - // map = {SurfaceType::PARABOLIC, ApertureType::RECTANGLE}; - // program_group_handle = pipeline_manager->getElementProgram(map); - // hitgroup_records_list[i].data.material_data = {0.875425, 0, 0, 0}; - // printf("RECTANGLE_PARABOLIC, program group address: %p \n", program_group_handle); - - // break; - - // case OptixCSP::OpticalEntityType::CYLINDRICAL: - // map = {SurfaceType::CYLINDER, ApertureType::RECTANGLE}; - // program_group_handle = pipeline_manager->getElementProgram(map); - // hitgroup_records_list[i].data.material_data = {0.95, 0, 0, 0}; - // printf("CYLINDRICAL, program group address: %p \n", program_group_handle); - - // break; - - // case OptixCSP::OpticalEntityType::TRIANGLE_FLAT: - // map = {SurfaceType::FLAT, ApertureType::TRIANGLE}; - // program_group_handle = pipeline_manager->getElementProgram(map); - // hitgroup_records_list[i].data.material_data = {0.95, 0, 0, 0}; - // printf("FLAT_TRIANGLE, program group address: %p \n", program_group_handle); - - // break; - - // case OptixCSP::OpticalEntityType::QUADRILATERAL_FLAT: - // ma = {SurfaceType::FLAT, ApertureType::QUADRILATERAL}; - // program_group_handle = pipeline_manager->getElementProgram(map); - // 
hitgroup_records_list[i].data.material_data = {0.875425, 0, 0, 0}; - // printf("FLAT_QUADRILATERAL, program group address: %p \n", program_group_handle); - - // break; - - // default: - // std::cerr << "Unknown OpticalEntityType: " << my_type << std::endl; - // } - OPTIX_CHECK(optixSbtRecordPackHeader(program_group_handle, &hitgroup_records_list[i].header)); } @@ -671,62 +592,52 @@ void SolTraceSystem::create_shader_binding_table() } } -void SolTraceSystem::setup_device_buffer() +void SolTraceSystem::allocate_device_buffers() { - // Initialize launch params + // Set constant launch params (unchanged across the while loop). data_manager->launch_params_H.width = m_number_of_rays; data_manager->launch_params_H.height = 1; data_manager->launch_params_H.max_depth = MAX_TRACE_DEPTH; - // const size_t hit_point_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float4) * data_manager->launch_params_H.max_depth; - // const size_t element_id_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(int32_t) * data_manager->launch_params_H.max_depth; - // const size_t hit_type_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(uint8_t) * data_manager->launch_params_H.max_depth; const size_t hit_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth * sizeof(HitRecord); const size_t sun_dir_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float3); - // if (data_manager->launch_params_H.hit_point_buffer == nullptr || m_hit_point_buffer_size_allocated != hit_point_buffer_size) - // { - // if (data_manager->launch_params_H.hit_point_buffer != nullptr) - // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_point_buffer))); - // CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_point_buffer), 
hit_point_buffer_size)); - // m_hit_point_buffer_size_allocated = hit_point_buffer_size; - // } - // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_point_buffer, 0, hit_point_buffer_size)); - - // if (data_manager->launch_params_H.element_id_buffer == nullptr || m_element_id_buffer_size_allocated != element_id_size) - // { - // if (data_manager->launch_params_H.element_id_buffer != nullptr) - // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.element_id_buffer))); - // CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.element_id_buffer), element_id_size)); - // m_element_id_buffer_size_allocated = element_id_size; - // } - // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.element_id_buffer, kElementIdBuffer, element_id_size)); - - // if (data_manager->launch_params_H.hit_type_buffer == nullptr || m_hit_type_buffer_size_allocated != hit_type_size) - // { - // if (data_manager->launch_params_H.hit_type_buffer != nullptr) - // CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_type_buffer))); - // CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_type_buffer), hit_type_size)); - // m_hit_type_buffer_size_allocated = hit_type_size; - // } - // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_type_buffer, HitType::HIT_UNASSIGNED, hit_type_size)); + // NOTE: cudaFree is nullptr safe if (data_manager->launch_params_H.hit_buffer == nullptr || m_hit_buffer_size_allocated != hit_buffer_size) { - if (data_manager->launch_params_H.hit_buffer != nullptr) - CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_buffer))); + CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_buffer))); CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_buffer), hit_buffer_size)); + CUDA_CHECK(cudaFreeHost(reinterpret_cast(m_hit_buffer_host))); + CUDA_CHECK(cudaMallocHost(reinterpret_cast(&m_hit_buffer_host), hit_buffer_size)); 
m_hit_buffer_size_allocated = hit_buffer_size; } - CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size)); if (data_manager->launch_params_H.sun_dir_buffer == nullptr || m_sun_dir_buffer_size_allocated != sun_dir_size) { - if (data_manager->launch_params_H.sun_dir_buffer != nullptr) - CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.sun_dir_buffer))); + CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.sun_dir_buffer))); CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.sun_dir_buffer), sun_dir_size)); m_sun_dir_buffer_size_allocated = sun_dir_size; } + + // Initialize RNG states once (sizes are constant across the while loop). + // curand states are persistent on the device and advance naturally across kernel launches. + const unsigned int num_rng_states = static_cast( + data_manager->launch_params_H.width * data_manager->launch_params_H.height); + data_manager->ensureCurandStates( + num_rng_states, + data_manager->launch_params_H.sun_dir_seed, + 0, + m_state.stream); + +} + +void SolTraceSystem::setup_device_buffer() +{ + const size_t hit_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth * sizeof(HitRecord); + const size_t sun_dir_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float3); + + CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size)); CUDA_CHECK(cudaMemset(data_manager->launch_params_H.sun_dir_buffer, 0, sun_dir_size)); data_manager->updateLaunchParams(); @@ -746,22 +657,6 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector // const int output_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; const uint_fast64_t output_size = num_rays * max_depth; - // if (static_cast(m_hp_output_buffer_host.size()) != output_size) - // 
m_hp_output_buffer_host.resize(output_size); - // if (static_cast(m_element_id_buffer_host.size()) != output_size) - // m_element_id_buffer_host.resize(output_size); - // if (static_cast(m_hit_type_buffer_host.size()) != output_size) - // m_hit_type_buffer_host.resize(output_size); - if (m_hit_buffer_host_capacity != output_size) - { - CUDA_CHECK(cudaFreeHost(reinterpret_cast(m_hit_buffer_host))); - CUDA_CHECK(cudaMallocHost(reinterpret_cast(&m_hit_buffer_host), output_size * sizeof(HitRecord))); - m_hit_buffer_host_capacity = output_size; - } - - // CUDA_CHECK(cudaMemcpy(m_hp_output_buffer_host.data(), data_manager->launch_params_H.hit_point_buffer, output_size * sizeof(float4), cudaMemcpyDeviceToHost)); - // CUDA_CHECK(cudaMemcpy(m_element_id_buffer_host.data(), data_manager->launch_params_H.element_id_buffer, output_size * sizeof(int32_t), cudaMemcpyDeviceToHost)); - // CUDA_CHECK(cudaMemcpy(m_hit_type_buffer_host.data(), data_manager->launch_params_H.hit_type_buffer, output_size * sizeof(uint8_t), cudaMemcpyDeviceToHost)); m_timer_memcpy.start(); CUDA_CHECK(cudaMemcpy(m_hit_buffer_host, data_manager->launch_params_H.hit_buffer, output_size * sizeof(HitRecord), cudaMemcpyDeviceToHost)); m_timer_memcpy.stop(); @@ -828,68 +723,6 @@ void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector } m_timer_host_processing.stop(); - // // Loop through each buffer slot - // uint_fast64_t ray_number = raynumber_vec.empty() ? 0 : raynumber_vec.back(); - // uint_fast64_t sunray_number = sunraynumber_vec.empty() ? 
0 : sunraynumber_vec.back(); - // for (int i = 0; i < output_size; ++i) - // { - - // // Get hit type - // // const uint8_t &hit_type = m_hit_type_buffer_host[i]; - // const HitRecord &hr = m_hit_buffer_host[i]; - // const uint8_t &hit_type = hr.hit_type; - - // // Skip if empty - // if (hit_type < HitType::HIT_CREATE || hit_type > HitType::HIT_EXIT) - // { - // continue; - // } - - // // If new ray, check if previous ray hit anything - // if (hit_type == HitType::HIT_CREATE) - // { - // // Remove last ray if it has no hits - // if (!hit_type_vec.empty() && hit_type_vec.back() == HitType::HIT_CREATE) - // { - // hp_vec.pop_back(); - // raynumber_vec.pop_back(); - // hit_type_vec.pop_back(); - // element_id_vec.pop_back(); - // sunraynumber_vec.pop_back(); - // ray_number--; - // } - - // // New ray - // ray_number++; - - // // Sun ray number always increments, even if no hit - // sunray_number++; - // } - - // // Get hit record, element_id - // // const float4 &hit_record = m_hp_output_buffer_host[i]; // [depth, pos x, pos y, pos z] - // // const int32_t &element_id = m_element_id_buffer_host[i]; - // const float4 &hit_point = hr.hit_point; - // const int32_t &element_id = hr.element_id; - - // // Collect results - // hp_vec.push_back(hit_point); - // raynumber_vec.push_back(ray_number); - // hit_type_vec.push_back(hit_type); - // element_id_vec.push_back(element_id); - // sunraynumber_vec.push_back(sunray_number); - // } - - // // Remove last ray if it is only CREATE - // if (!hit_type_vec.empty() && hit_type_vec.back() == HitType::HIT_CREATE) - // { - // hp_vec.pop_back(); - // raynumber_vec.pop_back(); - // element_id_vec.pop_back(); - // hit_type_vec.pop_back(); - // sunraynumber_vec.pop_back(); - // } - return; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index b0a89155..2bdb75fd 100644 --- 
a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -142,7 +142,6 @@ namespace OptixCSP // Allocated with cudaMallocHost, deallocated with cudaFreeHost resulting in using // page-locked memory for faster transfers between device and host. HitRecord *m_hit_buffer_host; - uint_fast64_t m_hit_buffer_host_capacity; // std::vector m_hp_output_buffer_host; // std::vector m_element_id_buffer_host; // std::vector m_hit_type_buffer_host; @@ -156,6 +155,7 @@ namespace OptixCSP std::vector> m_element_list; void create_shader_binding_table(); + void allocate_device_buffers(); void setup_device_buffer(); void get_buffer_results(std::vector &hp_vec, std::vector &raynumber_vec, std::vector &element_id_vec, std::vector &hit_type_vec, From ff1b7abd27c4307684185d1fa7af3bacdaa6485c Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Thu, 14 May 2026 14:35:10 -0600 Subject: [PATCH 26/60] Move post tracing ray filtering to device --- .../embree_runner/embree_runner.cpp | 3 +- .../native_runner/native_runner.cpp | 4 +- .../OptixCSP/src/core/ray_utils.cu | 218 ++++++++++++++ .../OptixCSP/src/core/ray_utils.h | 50 ++++ .../OptixCSP/src/core/soltrace_system.cpp | 269 +++++++----------- .../OptixCSP/src/core/soltrace_system.h | 64 ++--- .../optix_runner/optix_runner.cpp | 13 +- .../optix_runner/two_plate_test.cpp | 4 +- 8 files changed, 419 insertions(+), 206 deletions(-) create mode 100644 coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu create mode 100644 coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h diff --git a/coretrace/simulation_runner/embree_runner/embree_runner.cpp b/coretrace/simulation_runner/embree_runner/embree_runner.cpp index 52484840..5f66d9e7 100644 --- a/coretrace/simulation_runner/embree_runner/embree_runner.cpp +++ b/coretrace/simulation_runner/embree_runner/embree_runner.cpp @@ -50,8 +50,7 @@ namespace 
SolTrace::EmbreeRunner { // TODO: Do a more efficient implementation of this? this->clean_embree(); - NativeRunner::update_simulation(data); - return RunnerStatus::SUCCESS; + return NativeRunner::update_simulation(data); } RunnerStatus EmbreeRunner::run_simulation() diff --git a/coretrace/simulation_runner/native_runner/native_runner.cpp b/coretrace/simulation_runner/native_runner/native_runner.cpp index 621eb07b..ca6341aa 100644 --- a/coretrace/simulation_runner/native_runner/native_runner.cpp +++ b/coretrace/simulation_runner/native_runner/native_runner.cpp @@ -285,8 +285,8 @@ namespace SolTrace::NativeRunner { // TODO: Do a more efficient implementation of this? this->tsys.ClearAll(); - this->setup_simulation(data); - return RunnerStatus::SUCCESS; + return this->setup_simulation(data); + // return RunnerStatus::SUCCESS; } RunnerStatus NativeRunner::run_simulation() diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu new file mode 100644 index 00000000..02a8466c --- /dev/null +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu @@ -0,0 +1,218 @@ +#include "ray_utils.h" + +#include "shaders/Soltrace.h" +#include "utils/util_check.hpp" + +#include +#include + +#include +#include + +namespace OptixCSP +{ + + // --------------------------------------------------------------------------- + // Pass 1 – count output records per ray. + // + // For each ray the loop reads up to max_depth HitRecord entries. + // raw_count accumulates every valid record (HIT_CREATE through HIT_EXIT). + // A ray with only a HIT_CREATE event (raw_count == 1) contributes 0 output + // records (it missed all elements). Any ray with raw_count > 1 contributes + // all raw_count records (CREATE + one or more hits). 
+ // --------------------------------------------------------------------------- + __global__ static void count_ray_outputs( + const HitRecord *__restrict__ hit_buffer, + uint32_t num_rays, + uint32_t max_depth, + uint32_t *__restrict__ out_record_count, + uint32_t *__restrict__ out_has_hit) + { + const uint32_t ray = blockIdx.x * blockDim.x + threadIdx.x; + if (ray >= num_rays) + return; + + uint32_t raw_count = 0; + for (uint32_t depth = 0; depth < max_depth; ++depth) + { + const uint8_t ht = hit_buffer[max_depth * ray + depth].hit_type; + if (ht < HIT_CREATE || ht > HIT_EXIT) + break; + ++raw_count; + if (ht == HIT_ABSORB || ht == HIT_EXIT) + break; + } + + const uint32_t has_hit = (raw_count > 1) ? 1u : 0u; + out_record_count[ray] = has_hit ? raw_count : 0u; + out_has_hit[ray] = has_hit; + + return; + } + + // --------------------------------------------------------------------------- + // Pass 2 – write compacted records. + // + // Each thread handles one ray. Rays whose entry in the exclusive prefix-sum + // equals that of the next ray (i.e. record_count was 0) write nothing. + // For qualifying rays the CREATE record is written first, followed by all + // subsequent hit records, using the pre-computed offset as the base index. 
+ // --------------------------------------------------------------------------- + __global__ static void compact_ray_outputs( + const HitRecord *__restrict__ hit_buffer, + uint32_t num_rays, + uint32_t max_depth, + const uint32_t *__restrict__ offsets, + const uint32_t *__restrict__ has_hit, + HitRecord *__restrict__ out_buffer) + { + const uint32_t ray = blockIdx.x * blockDim.x + threadIdx.x; + if (ray >= num_rays || !has_hit[ray]) + return; + + const HitRecord *ray_base = hit_buffer + max_depth * ray; + uint32_t out_idx = offsets[ray]; + + // Depth 0 is always HIT_CREATE for qualifying rays + out_buffer[out_idx++] = ray_base[0]; + + for (uint32_t depth = 1; depth < max_depth; ++depth) + { + const HitRecord &hr = ray_base[depth]; + const uint8_t ht = hr.hit_type; + + if (ht < HIT_CREATE || ht > HIT_EXIT) + break; + + out_buffer[out_idx++] = hr; + if (ht == HIT_ABSORB || ht == HIT_EXIT) + break; + } + + return; + } + + // --------------------------------------------------------------------------- + // Host-callable scratch management + // --------------------------------------------------------------------------- + void allocate_compaction_scratch(CompactionScratch &scratch, uint32_t num_rays, uint32_t max_depth) + { + free_compaction_scratch(scratch); + + CUDA_CHECK(cudaMalloc(&scratch.d_count, num_rays * sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(&scratch.d_offsets, num_rays * sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(&scratch.d_has_hit, num_rays * sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(&scratch.d_n_hit, sizeof(uint32_t))); + + // Query CUB temp-storage sizes using typed null pointers (size query only). + // scan_bytes must cover both ExclusiveSum and DeviceSelect::Flagged (d_scan_tmp is reused). 
+ uint32_t *null_u32 = nullptr; + cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, null_u32, null_u32, num_rays); + cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, null_u32, null_u32, num_rays); + + size_t select_bytes = 0; + cub::CountingInputIterator count_iter(0u); + cub::DeviceSelect::Flagged(nullptr, select_bytes, count_iter, null_u32, null_u32, null_u32, num_rays); + if (select_bytes > scratch.scan_bytes) + scratch.scan_bytes = select_bytes; + + CUDA_CHECK(cudaMalloc(&scratch.d_scan_tmp, scratch.scan_bytes > 0 ? scratch.scan_bytes : 1)); + CUDA_CHECK(cudaMalloc(&scratch.d_red_tmp, scratch.red_bytes > 0 ? scratch.red_bytes : 1)); + + // Worst-case compacted output: every slot in the hit buffer could be kept + CUDA_CHECK(cudaMalloc(&scratch.d_compacted, num_rays * max_depth * sizeof(HitRecord))); + } + + void free_compaction_scratch(CompactionScratch &scratch) + { + // cudaFree is nullptr-safe + cudaFree(scratch.d_count); + cudaFree(scratch.d_offsets); + cudaFree(scratch.d_has_hit); + cudaFree(scratch.d_n_hit); + cudaFree(scratch.d_scan_tmp); + cudaFree(scratch.d_red_tmp); + cudaFree(scratch.d_compacted); + scratch = CompactionScratch{}; + } + + // --------------------------------------------------------------------------- + // Host-callable orchestrator + // --------------------------------------------------------------------------- + uint32_t gpu_compact_hit_buffer( + const HitRecord *d_hit_buffer, + uint32_t num_rays, + uint32_t max_depth, + uint32_t ray_offset, + std::vector &host_out, + std::vector &host_ray_ids, + cudaStream_t stream, + CompactionScratch &scratch) + { + if (num_rays == 0) + return 0; + + // ---- Pass 1: count records per ray ---- + const uint32_t block_size = 256; + const uint32_t grid_size = (num_rays + block_size - 1) / block_size; + count_ray_outputs<<>>( + d_hit_buffer, num_rays, max_depth, scratch.d_count, scratch.d_has_hit); + + // ---- Exclusive prefix-sum: d_count → d_offsets ---- + 
cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, scratch.d_count, scratch.d_offsets, num_rays, stream); + + // ---- Reduce: sum(d_has_hit) → d_n_hit ---- + cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, scratch.d_has_hit, scratch.d_n_hit, num_rays, stream); + + // ---- Synchronize to read back scalar results ---- + CUDA_CHECK(cudaStreamSynchronize(stream)); + + uint32_t last_offset = 0, last_count = 0, n_hit_rays = 0; + CUDA_CHECK(cudaMemcpy(&last_offset, scratch.d_offsets + (num_rays - 1), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&last_count, scratch.d_count + (num_rays - 1), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&n_hit_rays, scratch.d_n_hit, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + const uint32_t total_records = last_offset + last_count; + + if (total_records > 0) + { + // ---- Pass 2: write compacted HitRecords to pre-allocated device buffer ---- + compact_ray_outputs<<>>( + d_hit_buffer, num_rays, max_depth, scratch.d_offsets, scratch.d_has_hit, scratch.d_compacted); + + // ---- After Pass 2 d_offsets is free; reuse it to compact global ray IDs ---- + // DeviceSelect::Flagged selects (ray_offset + i) for each i where d_has_hit[i] == 1. + // d_scan_tmp is also free (ExclusiveSum already completed). 
+ cub::CountingInputIterator ray_id_iter(ray_offset); + cub::DeviceSelect::Flagged( + scratch.d_scan_tmp, scratch.scan_bytes, + ray_id_iter, scratch.d_has_hit, + scratch.d_offsets, // output: global IDs of hit rays + scratch.d_n_hit, // output count (already read; safe to overwrite) + num_rays, stream); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // ---- Copy compacted HitRecords to host ---- + const size_t prev_rec = host_out.size(); + host_out.resize(prev_rec + total_records); + CUDA_CHECK(cudaMemcpy( + host_out.data() + prev_rec, + scratch.d_compacted, + total_records * sizeof(HitRecord), + cudaMemcpyDeviceToHost)); + + // ---- Copy global ray IDs to host (one per logical hit ray) ---- + const size_t prev_ids = host_ray_ids.size(); + host_ray_ids.resize(prev_ids + n_hit_rays); + CUDA_CHECK(cudaMemcpy( + host_ray_ids.data() + prev_ids, + scratch.d_offsets, + n_hit_rays * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + } + + return n_hit_rays; + } + +} // namespace OptixCSP diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h new file mode 100644 index 00000000..4b82fd41 --- /dev/null +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include + +namespace OptixCSP +{ + + struct HitRecord; + + /// Device scratch buffers required by gpu_compact_hit_buffer. + /// Allocated once via allocate_compaction_scratch and reused across calls + /// as long as num_rays and max_depth stay the same. 
+ struct CompactionScratch + { + uint32_t *d_count = nullptr; // per-ray output record count + uint32_t *d_offsets = nullptr; // exclusive prefix-sum of d_count + uint32_t *d_has_hit = nullptr; // 1 if ray contributes records, else 0 + uint32_t *d_n_hit = nullptr; // scalar: total hit rays + void *d_scan_tmp = nullptr; // CUB DeviceScan temp storage + size_t scan_bytes = 0; + void *d_red_tmp = nullptr; // CUB DeviceReduce temp storage + size_t red_bytes = 0; + HitRecord *d_compacted = nullptr; // worst-case compacted output (num_rays * max_depth) + }; + + /// Allocate all device scratch buffers for the given ray-buffer dimensions. + /// Frees any previous allocation before reallocating. + void allocate_compaction_scratch(CompactionScratch &scratch, uint32_t num_rays, uint32_t max_depth); + + /// Free all device scratch buffers and reset the struct to its default state. + void free_compaction_scratch(CompactionScratch &scratch); + + /// GPU-side stream compaction of the raw hit buffer. + /// Uses pre-allocated scratch buffers — no device allocations occur inside this call. + /// Appends compacted HitRecords to @p host_out and the corresponding global ray indices + /// (ray_offset + local_ray_index) to @p host_ray_ids (one entry per logical hit ray). + /// @returns Number of rays that produced at least one non-CREATE hit. 
+ uint32_t gpu_compact_hit_buffer( + const HitRecord *d_hit_buffer, + uint32_t num_rays, + uint32_t max_depth, + uint32_t ray_offset, + std::vector &host_out, + std::vector &host_ray_ids, + cudaStream_t stream, + CompactionScratch &scratch); + +} // namespace OptixCSP diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index b167ac32..34a8e894 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -1,4 +1,5 @@ #include "soltrace_system.h" +#include "ray_utils.h" #include "CspElement.h" #include "data_manager.h" @@ -17,6 +18,7 @@ #include "utils/util_check.hpp" #include "utils/math_util.h" +#include #include #include #include @@ -44,7 +46,8 @@ SolTraceSystem::SolTraceSystem() m_mem_free_before(0), m_mem_free_after(0), m_optical_errors(false), - m_hit_buffer_host(nullptr), + m_n_hit_rays(0), + m_n_sun_rays(0), m_include_sun_shape_errors(false), m_timer_setup(), m_timer_trace(), @@ -55,8 +58,6 @@ SolTraceSystem::SolTraceSystem() m_timer_setup_buffer(), m_timer_optix_launch(), m_timer_collect_results(), - m_timer_memcpy(), - m_timer_host_processing(), m_n_run_iterations(0), geometry_manager(std::make_shared(m_state, m_verbose)), data_manager(std::make_shared()), @@ -269,11 +270,10 @@ void SolTraceSystem::run() NVTX3_FUNC_RANGE(); #endif - // Initialize results vectors - m_hp_vec.clear(); - m_raynumber_vec.clear(); - m_element_id_vec.clear(); - m_hit_type_vec.clear(); + // Initialize results + m_hit_records.clear(); + m_n_hit_rays = 0; + m_n_sun_rays = 0; uint_fast64_t N_ray_hit = 0; uint_fast64_t N_ray_gen = 0; @@ -282,8 +282,6 @@ void SolTraceSystem::run() m_timer_setup_buffer.reset(); m_timer_optix_launch.reset(); m_timer_collect_results.reset(); - m_timer_memcpy.reset(); - m_timer_host_processing.reset(); m_n_run_iterations 
= 0; // Allocate device buffers and initialize RNG states once (sizes are constant across the while loop). @@ -320,18 +318,18 @@ void SolTraceSystem::run() m_timer_optix_launch.start(); { #ifdef NVTX_ENABLED - nvtx3::scoped_range nvtx_launch{"optixLaunch"}; + nvtx3::scoped_range nvtx_launch{"optixLaunch"}; #endif - OPTIX_CHECK(optixLaunch( - m_state.pipeline, - m_state.stream, // Assume this stream is properly created. - reinterpret_cast(data_manager->getDeviceLaunchParams()), - sizeof(OptixCSP::LaunchParams), - &m_state.sbt, // Shader Binding Table. - width, // Launch dimensions - height, - 1)); - CUDA_SYNC_CHECK(); + OPTIX_CHECK(optixLaunch( + m_state.pipeline, + m_state.stream, // Assume this stream is properly created. + reinterpret_cast(data_manager->getDeviceLaunchParams()), + sizeof(OptixCSP::LaunchParams), + &m_state.sbt, // Shader Binding Table. + width, // Launch dimensions + height, + 1)); + CUDA_SYNC_CHECK(); } // nvtx_launch m_timer_optix_launch.stop(); @@ -341,27 +339,32 @@ void SolTraceSystem::run() #ifdef NVTX_ENABLED nvtx3::scoped_range nvtx_collect{"get_buffer_results"}; #endif - get_buffer_results(m_hp_vec, m_raynumber_vec, m_element_id_vec, m_hit_type_vec, - m_sunraynumber_vec); + get_buffer_results(); } m_timer_collect_results.stop(); - N_ray_hit = m_raynumber_vec.empty() ? 0 : m_raynumber_vec.back(); + N_ray_hit = m_n_hit_rays; N_ray_gen += width; + m_n_sun_rays = N_ray_gen; } - // Trim excess rays - if (N_ray_hit > m_number_of_rays) + // Trim excess rays: remove ray groups from the tail until m_n_hit_rays == m_number_of_rays. + // Each group starts at the last HIT_CREATE record in m_hit_records. 
+ while (m_n_hit_rays > m_number_of_rays && !m_hit_records.empty()) { - while (m_raynumber_vec.back() > m_number_of_rays) - { - m_hp_vec.pop_back(); - m_raynumber_vec.pop_back(); - m_element_id_vec.pop_back(); - m_hit_type_vec.pop_back(); - m_sunraynumber_vec.pop_back(); - } + // Walk backwards to find the last CREATE record + auto rit = std::find_if(m_hit_records.rbegin(), m_hit_records.rend(), + [](const HitRecord &r) + { return r.hit_type == HitType::HIT_CREATE; }); + if (rit == m_hit_records.rend()) + break; + m_hit_records.erase(std::prev(rit.base()), m_hit_records.end()); + m_hit_ray_ids.pop_back(); + --m_n_hit_rays; } + // m_n_sun_rays = rays generated up to and including the last retained hit ray. + if (!m_hit_ray_ids.empty()) + m_n_sun_rays = static_cast(m_hit_ray_ids.back()) + 1; m_timer_trace.stop(); @@ -394,9 +397,6 @@ void SolTraceSystem::update() { const int N_slots = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; - // const size_t hit_point_buffer_size = N_slots * sizeof(float4); - // const size_t element_id_size = N_slots * sizeof(int32_t); - // const size_t hit_type_buffer_size = N_slots * sizeof(uint8_t); const size_t hit_buffer_size = N_slots * sizeof(HitRecord); // update aabb and sun plane accordingly @@ -404,9 +404,6 @@ void SolTraceSystem::update() // update data on the device data_manager->updateGeometryDataArray(geometry_manager->get_geometry_data_array()); - // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_point_buffer, 0, hit_point_buffer_size)); - // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.element_id_buffer, kElementIdBuffer, element_id_size)); - // CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_type_buffer, HitType::HIT_UNASSIGNED, hit_type_buffer_size)); CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size)); data_manager->updateLaunchParams(); @@ -417,10 +414,25 @@ void 
SolTraceSystem::get_hp_output(std::vector &hp_vec, std::vector &element_id_vec, std::vector &hit_type_vec) { - hp_vec = m_hp_vec; - raynumber_vec = m_raynumber_vec; - element_id_vec = m_element_id_vec; - hit_type_vec = m_hit_type_vec; + hp_vec.clear(); + raynumber_vec.clear(); + element_id_vec.clear(); + hit_type_vec.clear(); + hp_vec.reserve(m_hit_records.size()); + raynumber_vec.reserve(m_hit_records.size()); + element_id_vec.reserve(m_hit_records.size()); + hit_type_vec.reserve(m_hit_records.size()); + + uint_fast64_t ray_number = 0; + for (const HitRecord &r : m_hit_records) + { + if (r.hit_type == HitType::HIT_CREATE) + ++ray_number; + hp_vec.push_back(r.hit_point); + raynumber_vec.push_back(ray_number); + element_id_vec.push_back(r.element_id); + hit_type_vec.push_back(r.hit_type); + } } void SolTraceSystem::clean_up() @@ -464,10 +476,9 @@ void SolTraceSystem::clean_up() m_hit_buffer_size_allocated = 0; m_sun_dir_buffer_size_allocated = 0; - data_manager->cleanup(); + free_compaction_scratch(m_compaction_scratch); - CUDA_CHECK(cudaFreeHost(reinterpret_cast(m_hit_buffer_host))); - m_hit_buffer_host = nullptr; + data_manager->cleanup(); m_state.context = nullptr; m_state.stream = nullptr; @@ -487,11 +498,10 @@ void SolTraceSystem::reset() clean_up(); m_element_list.clear(); - m_hp_vec.clear(); - m_raynumber_vec.clear(); - m_element_id_vec.clear(); - m_hit_type_vec.clear(); - m_sunraynumber_vec.clear(); + m_hit_records.clear(); + m_hit_ray_ids.clear(); + m_n_hit_rays = 0; + m_n_sun_rays = 0; m_sun = nullptr; m_number_of_rays = 0; @@ -608,9 +618,12 @@ void SolTraceSystem::allocate_device_buffers() { CUDA_CHECK(cudaFree(reinterpret_cast(data_manager->launch_params_H.hit_buffer))); CUDA_CHECK(cudaMalloc(reinterpret_cast(&data_manager->launch_params_H.hit_buffer), hit_buffer_size)); - CUDA_CHECK(cudaFreeHost(reinterpret_cast(m_hit_buffer_host))); - CUDA_CHECK(cudaMallocHost(reinterpret_cast(&m_hit_buffer_host), hit_buffer_size)); m_hit_buffer_size_allocated = 
hit_buffer_size; + + // Reallocate compaction scratch whenever ray-buffer dimensions change + const uint32_t num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height; + const uint32_t max_depth = static_cast(data_manager->launch_params_H.max_depth); + allocate_compaction_scratch(m_compaction_scratch, num_rays, max_depth); } if (data_manager->launch_params_H.sun_dir_buffer == nullptr || m_sun_dir_buffer_size_allocated != sun_dir_size) @@ -629,7 +642,6 @@ void SolTraceSystem::allocate_device_buffers() data_manager->launch_params_H.sun_dir_seed, 0, m_state.stream); - } void SolTraceSystem::setup_device_buffer() @@ -643,87 +655,29 @@ void SolTraceSystem::setup_device_buffer() data_manager->updateLaunchParams(); } -// Collects results from device buffer -// only keeps rays that hit elements -void SolTraceSystem::get_buffer_results(std::vector &hp_vec, std::vector &raynumber_vec, - std::vector &element_id_vec, std::vector &hit_type_vec, - std::vector &sunraynumber_vec) +// Compacts the device hit buffer on the GPU, then copies only qualifying records to host. +// Rays that produced only a HIT_CREATE event (missed all elements) are discarded. +// Empty depth slots are discarded. The compacted HitRecord array is appended to +// m_hit_records and m_n_hit_rays is incremented by the number of newly collected hit rays. 
+void SolTraceSystem::get_buffer_results() { #ifdef NVTX_ENABLED NVTX3_FUNC_RANGE(); #endif - const uint_fast64_t max_depth = data_manager->launch_params_H.max_depth; - const uint_fast64_t num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height; - // const int output_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; - const uint_fast64_t output_size = num_rays * max_depth; - - m_timer_memcpy.start(); - CUDA_CHECK(cudaMemcpy(m_hit_buffer_host, data_manager->launch_params_H.hit_buffer, output_size * sizeof(HitRecord), cudaMemcpyDeviceToHost)); - m_timer_memcpy.stop(); - - // Loop through each ray - uint_fast64_t ray_number = raynumber_vec.empty() ? 0 : raynumber_vec.back(); - uint_fast64_t sunray_number = sunraynumber_vec.empty() ? 0 : sunraynumber_vec.back(); - HitRecord pending_record; - bool pending_create = false; - m_timer_host_processing.start(); - for (uint_fast64_t ray = 0; ray < num_rays; ++ray) - { - // bool ray_hit_something = false; - for (uint_fast64_t depth = 0; depth < max_depth; ++depth) - { - uint_fast64_t idx = max_depth * ray + depth; - const HitRecord &hr = m_hit_buffer_host[idx]; - const uint8_t &hit_type = hr.hit_type; - - if (hit_type < HitType::HIT_CREATE || hit_type > HitType::HIT_EXIT) - { - // Hit end of ray history--go to next ray - break; - } - - if (hit_type == HitType::HIT_CREATE) - { - // New ray -- capture data and wait to see if the ray hit anything - pending_record = hr; - pending_create = true; - sunray_number++; - } - else - { - // Any invalid hit types have already been handled by the first if block - - // Clear the pending data if necessary - if (pending_create) - { - pending_create = false; - ray_number++; - hp_vec.push_back(pending_record.hit_point); - raynumber_vec.push_back(ray_number); - hit_type_vec.push_back(pending_record.hit_type); - element_id_vec.push_back(pending_record.element_id); - 
sunraynumber_vec.push_back(sunray_number); - } - - hp_vec.push_back(hr.hit_point); - raynumber_vec.push_back(ray_number); - hit_type_vec.push_back(hit_type); - element_id_vec.push_back(hr.element_id); - sunraynumber_vec.push_back(sunray_number); - - if (hit_type == HitType::HIT_ABSORB || hit_type == HitType::HIT_EXIT) - { - // Ray has terminated. Go to the next. - // NOTE: As of this writing, OptixRunner does not mark rays - // with HIT_EXIT. Include it here anyway. - break; - } - } - } - } - m_timer_host_processing.stop(); - - return; + const uint32_t num_rays = static_cast(data_manager->launch_params_H.width * + data_manager->launch_params_H.height); + const uint32_t max_depth = static_cast(data_manager->launch_params_H.max_depth); + + const uint32_t n_new_hits = gpu_compact_hit_buffer( + data_manager->launch_params_H.hit_buffer, + num_rays, + max_depth, + data_manager->launch_params_H.ray_offset, + m_hit_records, + m_hit_ray_ids, + m_state.stream, + m_compaction_scratch); + m_n_hit_rays += n_new_hits; } void SolTraceSystem::add_element(std::shared_ptr e) @@ -743,24 +697,23 @@ double SolTraceSystem::get_time_setup() void SolTraceSystem::print_timing() const { - const double t_setup = m_timer_setup.get_time_sec(); - const double t_aabb = m_timer_aabb.get_time_sec(); - const double t_geometry = m_timer_geometry.get_time_sec(); - const double t_pipeline = m_timer_pipeline.get_time_sec(); - const double t_sbt = m_timer_sbt.get_time_sec(); - - const double t_trace = m_timer_trace.get_time_sec(); - const double t_buf_setup = m_timer_setup_buffer.get_time_sec(); - const double t_launch = m_timer_optix_launch.get_time_sec(); - const double t_collect = m_timer_collect_results.get_time_sec(); - const double t_memcpy = m_timer_memcpy.get_time_sec(); - const double t_host_proc = m_timer_host_processing.get_time_sec(); + const double t_setup = m_timer_setup.get_time_sec(); + const double t_aabb = m_timer_aabb.get_time_sec(); + const double t_geometry = 
m_timer_geometry.get_time_sec(); + const double t_pipeline = m_timer_pipeline.get_time_sec(); + const double t_sbt = m_timer_sbt.get_time_sec(); + + const double t_trace = m_timer_trace.get_time_sec(); + const double t_buf_setup = m_timer_setup_buffer.get_time_sec(); + const double t_launch = m_timer_optix_launch.get_time_sec(); + const double t_collect = m_timer_collect_results.get_time_sec(); const double inv_n = m_n_run_iterations > 0 ? 1.0 / static_cast(m_n_run_iterations) : 0.0; - const auto pct = [](double num, double denom) -> double { + const auto pct = [](double num, double denom) -> double + { return denom > 0.0 ? 100.0 * num / denom : 0.0; }; @@ -768,30 +721,24 @@ void SolTraceSystem::print_timing() const std::cout << "\n=== SolTraceSystem Timing Summary ===\n"; std::cout << "\n--- initialize() ---\n"; - std::cout << " AABB computation : " << t_aabb << " s (" << pct(t_aabb, t_setup) << " %)\n"; + std::cout << " AABB computation : " << t_aabb << " s (" << pct(t_aabb, t_setup) << " %)\n"; std::cout << " Geometry creation : " << t_geometry << " s (" << pct(t_geometry, t_setup) << " %)\n"; std::cout << " Pipeline creation : " << t_pipeline << " s (" << pct(t_pipeline, t_setup) << " %)\n"; - std::cout << " SBT creation : " << t_sbt << " s (" << pct(t_sbt, t_setup) << " %)\n"; - std::cout << " Total setup : " << t_setup << " s\n"; + std::cout << " SBT creation : " << t_sbt << " s (" << pct(t_sbt, t_setup) << " %)\n"; + std::cout << " Total setup : " << t_setup << " s\n"; std::cout << "\n--- run() [" << m_n_run_iterations << " iteration" << (m_n_run_iterations == 1 ? 
"" : "s") << "] ---\n"; std::cout << " Setup device buffer : total = " << t_buf_setup << " s" << " avg/iter = " << t_buf_setup * inv_n << " s" << " (" << pct(t_buf_setup, t_trace) << " %)\n"; - std::cout << " OptiX launch : total = " << t_launch << " s" - << " avg/iter = " << t_launch * inv_n << " s" - << " (" << pct(t_launch, t_trace) << " %)\n"; - std::cout << " Collect results : total = " << t_collect << " s" - << " avg/iter = " << t_collect * inv_n << " s" - << " (" << pct(t_collect, t_trace) << " %)\n"; - std::cout << " memcpy D->H : total = " << t_memcpy << " s" - << " avg/iter = " << t_memcpy * inv_n << " s" - << " (" << pct(t_memcpy, t_collect) << " % of collect)\n"; - std::cout << " host processing : total = " << t_host_proc << " s" - << " avg/iter = " << t_host_proc * inv_n << " s" - << " (" << pct(t_host_proc, t_collect) << " % of collect)\n"; - std::cout << " Total trace : " << t_trace << " s\n"; + std::cout << " OptiX launch : total = " << t_launch << " s" + << " avg/iter = " << t_launch * inv_n << " s" + << " (" << pct(t_launch, t_trace) << " %)\n"; + std::cout << " Collect results : total = " << t_collect << " s" + << " avg/iter = " << t_collect * inv_n << " s" + << " (" << pct(t_collect, t_trace) << " %)\n"; + std::cout << " Total trace : " << t_trace << " s\n"; std::cout << "\n--- Grand Total ---\n"; std::cout << " Setup + Trace : " << (t_setup + t_trace) << " s\n"; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 2bdb75fd..cd2394ae 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -9,8 +9,10 @@ #include "core/soltrace_state.h" // SoltraceState #include "core/vec3d.h" // Vec3d #include "core/timer.h" -#include "core/CspElement.h" // CspElement -#include "core/Surface.h" // Surface and derived classes 
+#include "core/CspElement.h" // CspElement +#include "core/Surface.h" // Surface and derived classes +#include "shaders/Soltrace.h" // HitRecord, HitType +#include "ray_utils.h" // CompactionScratch #include "../../../../../simulation_data/simulation_data_export.hpp" @@ -24,8 +26,6 @@ namespace OptixCSP class Vec3d; class Surface; - struct HitRecord; - static constexpr SolTrace::Data::SunShape kSupportedSunshapes[] = { SolTrace::Data::SunShape::GAUSSIAN, SolTrace::Data::SunShape::PILLBOX, @@ -98,14 +98,13 @@ namespace OptixCSP /// double get_sun_plane_area() const; - uint_fast64_t get_N_sun_rays() - { - if (m_sunraynumber_vec.empty()) - return 0; - return m_sunraynumber_vec.back(); - } + uint_fast64_t get_N_sun_rays() const { return m_n_sun_rays; } + + /// Returns the compacted hit records (CREATE + hits, misses excluded). + const std::vector &get_hit_records() const { return m_hit_records; } - std::vector get_sunraynumber_vec() const { return m_sunraynumber_vec; } + /// Returns the number of rays that hit at least one element. + uint_fast64_t get_N_hit_rays() const { return m_n_hit_rays; } void set_sun_shape_errors(bool flag) { this->m_include_sun_shape_errors = flag; } private: @@ -131,35 +130,35 @@ namespace OptixCSP // Results - // Contains information on rays that hit objects - std::vector m_hp_vec; - std::vector m_raynumber_vec; - std::vector m_element_id_vec; - std::vector m_hit_type_vec; - std::vector m_sunraynumber_vec; // This is ID of hit rays out of all generated rays - - // Reused host-side scratch buffers for copying launch results back from device. - // Allocated with cudaMallocHost, deallocated with cudaFreeHost resulting in using - // page-locked memory for faster transfers between device and host. - HitRecord *m_hit_buffer_host; - // std::vector m_hp_output_buffer_host; - // std::vector m_element_id_buffer_host; - // std::vector m_hit_type_buffer_host; + // Compacted hit records: one contiguous array of HitRecord. 
+ // Each ray group starts with a HIT_CREATE record followed by its hits. + // Rays that produced no hits (CREATE-only) are excluded. + std::vector m_hit_records; + + // Global ray index (ray_offset + local_index) for each logical hit ray in m_hit_records. + // Parallel to the logical rays (not records): m_hit_ray_ids.size() == m_n_hit_rays. + std::vector m_hit_ray_ids; + + // Count of rays that produced at least one non-CREATE hit. + uint_fast64_t m_n_hit_rays = 0; + + // Total rays generated (launched from the sun plane) across all run() iterations. + uint_fast64_t m_n_sun_rays = 0; // Current allocated device launch buffer sizes. - // size_t m_hit_point_buffer_size_allocated = 0; - // size_t m_element_id_buffer_size_allocated = 0; - // size_t m_hit_type_buffer_size_allocated = 0; size_t m_hit_buffer_size_allocated = 0; size_t m_sun_dir_buffer_size_allocated = 0; + // Pre-allocated device scratch buffers for GPU stream compaction. + CompactionScratch m_compaction_scratch; + std::vector> m_element_list; void create_shader_binding_table(); void allocate_device_buffers(); void setup_device_buffer(); - void get_buffer_results(std::vector &hp_vec, std::vector &raynumber_vec, - std::vector &element_id_vec, std::vector &hit_type_vec, - std::vector &sunraynumber_vec); + // GPU-side compaction: count hits, compact buffer on device, copy result to m_hit_records. + // Increments m_n_hit_rays by the number of newly collected hit rays. 
+ void get_buffer_results(); Timer m_timer_setup; Timer m_timer_trace; @@ -174,9 +173,6 @@ namespace OptixCSP Timer m_timer_setup_buffer; Timer m_timer_optix_launch; Timer m_timer_collect_results; - // get_buffer_results() sub-timers - Timer m_timer_memcpy; - Timer m_timer_host_processing; uint64_t m_n_run_iterations; // memory usage diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 700321c2..81673d3a 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -133,7 +133,7 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) auto optix_el = std::make_shared(); auto origin = el->get_origin_global(); auto ap = el->get_aim_vector_global(); - OptixCSP::Vec3d origin_vec(origin.x, origin.y, origin.z); + // OptixCSP::Vec3d origin_vec(origin.x, origin.y, origin.z); optix_el->set_origin(ToVec3d(origin)); optix_el->set_aim_point(ToVec3d(ap)); optix_el->set_rotation_matrix(ToMatrix33d(el->get_local_to_global())); @@ -355,9 +355,10 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) RunnerStatus OptixRunner::update_simulation(const SimulationData *data) { - this->setup_simulation(data); - // TODO: Implement this - return RunnerStatus::SUCCESS; + // TODO: Need this call? + // this->m_sys.clean_up(); + return this->setup_simulation(data); + // TODO: Implement this in a less lazy manner... 
} RunnerStatus OptixRunner::run_simulation() @@ -406,6 +407,8 @@ RunnerStatus OptixRunner::report_simulation(SimulationResult *result, std::map ray_records; std::map::iterator iter; + // TODO: This should be redone without using these vectors and just using the + // internal hit record vector // Get results from optixcsp std::vector hp_vec; std::vector raynumber_vec; @@ -422,7 +425,7 @@ RunnerStatus OptixRunner::report_simulation(SimulationResult *result, // Loop through data, populating ray records // Assumes ray data is grouped serially size_t ndata = hp_vec.size(); - uint_fast64_t raynum_prev = -1; + // uint_fast64_t raynum_prev = -1; uint_fast64_t raynum = 0; SolTrace::Result::ray_record_ptr rec = nullptr; SolTrace::Result::interaction_ptr intr = nullptr; diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp index 17385bee..b46c447c 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp @@ -151,12 +151,12 @@ TEST(TwoPlateOptix, ReflectionToAbsorber) std::vector element_id_vec; std::vector hit_type_vec; sys->get_hp_output(hp_vec, raynumber_vec, element_id_vec, hit_type_vec); - std::vector sunraynumber_vec = sys->get_sunraynumber_vec(); + // std::vector sunraynumber_vec = sys->get_sunraynumber_vec(); EXPECT_EQ(hp_vec.size(), raynumber_vec.size()); // Hit results are same size EXPECT_EQ(raynumber_vec.size(), element_id_vec.size()); EXPECT_EQ(element_id_vec.size(), hit_type_vec.size()); - EXPECT_EQ(N_sun_rays, sunraynumber_vec.back()); // Reported sun rays is the sun ray id of last hit + // EXPECT_EQ(N_sun_rays, sunraynumber_vec.back()); // Reported sun rays is the sun ray id of last hit EXPECT_TRUE(N_sun_rays <= sd.get_simulation_parameters().max_number_of_rays); // Only generated max number of rays or fewer } From 
efc7e596cde25cf13e91e54ddf9f25577d56659e Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Thu, 14 May 2026 15:06:01 -0600 Subject: [PATCH 27/60] Add timers for ray filtering; fix compiler warning --- .../OptixCSP/src/core/ray_utils.cu | 60 ++++++++++++++++++- .../OptixCSP/src/core/ray_utils.h | 21 ++++++- .../OptixCSP/src/core/soltrace_system.cpp | 18 +++++- .../OptixCSP/src/core/soltrace_system.h | 3 +- 4 files changed, 96 insertions(+), 6 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu index 02a8466c..109bebae 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu @@ -5,7 +5,9 @@ #include #include +#include +#include #include #include @@ -111,7 +113,7 @@ namespace OptixCSP cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, null_u32, null_u32, num_rays); size_t select_bytes = 0; - cub::CountingInputIterator count_iter(0u); + thrust::counting_iterator count_iter(0u); cub::DeviceSelect::Flagged(nullptr, select_bytes, count_iter, null_u32, null_u32, null_u32, num_rays); if (select_bytes > scratch.scan_bytes) scratch.scan_bytes = select_bytes; @@ -121,6 +123,12 @@ namespace OptixCSP // Worst-case compacted output: every slot in the hit buffer could be kept CUDA_CHECK(cudaMalloc(&scratch.d_compacted, num_rays * max_depth * sizeof(HitRecord))); + + // CUDA events for GPU-phase timing + CUDA_CHECK(cudaEventCreate(&scratch.e_gpu1_start)); + CUDA_CHECK(cudaEventCreate(&scratch.e_gpu1_stop)); + CUDA_CHECK(cudaEventCreate(&scratch.e_gpu2_start)); + CUDA_CHECK(cudaEventCreate(&scratch.e_gpu2_stop)); } void free_compaction_scratch(CompactionScratch &scratch) @@ -133,6 +141,11 @@ namespace OptixCSP cudaFree(scratch.d_scan_tmp); cudaFree(scratch.d_red_tmp); cudaFree(scratch.d_compacted); + // cudaEventDestroy is not nullptr-safe + if 
(scratch.e_gpu1_start) cudaEventDestroy(scratch.e_gpu1_start); + if (scratch.e_gpu1_stop) cudaEventDestroy(scratch.e_gpu1_stop); + if (scratch.e_gpu2_start) cudaEventDestroy(scratch.e_gpu2_start); + if (scratch.e_gpu2_stop) cudaEventDestroy(scratch.e_gpu2_stop); scratch = CompactionScratch{}; } @@ -147,7 +160,8 @@ namespace OptixCSP std::vector &host_out, std::vector &host_ray_ids, cudaStream_t stream, - CompactionScratch &scratch) + CompactionScratch &scratch, + CompactionTimings *timings) { if (num_rays == 0) return 0; @@ -155,6 +169,9 @@ namespace OptixCSP // ---- Pass 1: count records per ray ---- const uint32_t block_size = 256; const uint32_t grid_size = (num_rays + block_size - 1) / block_size; + + if (timings) CUDA_CHECK(cudaEventRecord(scratch.e_gpu1_start, stream)); + count_ray_outputs<<>>( d_hit_buffer, num_rays, max_depth, scratch.d_count, scratch.d_has_hit); @@ -164,26 +181,45 @@ namespace OptixCSP // ---- Reduce: sum(d_has_hit) → d_n_hit ---- cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, scratch.d_has_hit, scratch.d_n_hit, num_rays, stream); + if (timings) CUDA_CHECK(cudaEventRecord(scratch.e_gpu1_stop, stream)); + // ---- Synchronize to read back scalar results ---- CUDA_CHECK(cudaStreamSynchronize(stream)); + if (timings) + { + float ms = 0.f; + CUDA_CHECK(cudaEventElapsedTime(&ms, scratch.e_gpu1_start, scratch.e_gpu1_stop)); + timings->gpu_phase1_ms += ms; + } + + // ---- D→H scalar memcpy (CPU wall-clock) ---- + std::chrono::high_resolution_clock::time_point t_scalar; + if (timings) t_scalar = std::chrono::high_resolution_clock::now(); + uint32_t last_offset = 0, last_count = 0, n_hit_rays = 0; CUDA_CHECK(cudaMemcpy(&last_offset, scratch.d_offsets + (num_rays - 1), sizeof(uint32_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(&last_count, scratch.d_count + (num_rays - 1), sizeof(uint32_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(&n_hit_rays, scratch.d_n_hit, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + if (timings) + 
timings->scalar_dth_ms += std::chrono::duration( + std::chrono::high_resolution_clock::now() - t_scalar).count(); + const uint32_t total_records = last_offset + last_count; if (total_records > 0) { // ---- Pass 2: write compacted HitRecords to pre-allocated device buffer ---- + if (timings) CUDA_CHECK(cudaEventRecord(scratch.e_gpu2_start, stream)); + compact_ray_outputs<<>>( d_hit_buffer, num_rays, max_depth, scratch.d_offsets, scratch.d_has_hit, scratch.d_compacted); // ---- After Pass 2 d_offsets is free; reuse it to compact global ray IDs ---- // DeviceSelect::Flagged selects (ray_offset + i) for each i where d_has_hit[i] == 1. // d_scan_tmp is also free (ExclusiveSum already completed). - cub::CountingInputIterator ray_id_iter(ray_offset); + thrust::counting_iterator ray_id_iter(ray_offset); cub::DeviceSelect::Flagged( scratch.d_scan_tmp, scratch.scan_bytes, ray_id_iter, scratch.d_has_hit, @@ -191,8 +227,21 @@ namespace OptixCSP scratch.d_n_hit, // output count (already read; safe to overwrite) num_rays, stream); + if (timings) CUDA_CHECK(cudaEventRecord(scratch.e_gpu2_stop, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + if (timings) + { + float ms = 0.f; + CUDA_CHECK(cudaEventElapsedTime(&ms, scratch.e_gpu2_start, scratch.e_gpu2_stop)); + timings->gpu_phase2_ms += ms; + } + + // ---- D→H bulk memcpy (CPU wall-clock) ---- + std::chrono::high_resolution_clock::time_point t_bulk; + if (timings) t_bulk = std::chrono::high_resolution_clock::now(); + // ---- Copy compacted HitRecords to host ---- const size_t prev_rec = host_out.size(); host_out.resize(prev_rec + total_records); @@ -210,8 +259,13 @@ namespace OptixCSP scratch.d_offsets, n_hit_rays * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + if (timings) + timings->bulk_dth_ms += std::chrono::duration( + std::chrono::high_resolution_clock::now() - t_bulk).count(); } + if (timings) ++timings->n_calls; return n_hit_rays; } diff --git 
a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h index 4b82fd41..ad2e7a43 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h @@ -23,6 +23,23 @@ namespace OptixCSP void *d_red_tmp = nullptr; // CUB DeviceReduce temp storage size_t red_bytes = 0; HitRecord *d_compacted = nullptr; // worst-case compacted output (num_rays * max_depth) + + // CUDA events for GPU-phase timing (non-null after allocate_compaction_scratch). + cudaEvent_t e_gpu1_start = nullptr; // before count/scan/reduce kernels + cudaEvent_t e_gpu1_stop = nullptr; // after count/scan/reduce kernels + cudaEvent_t e_gpu2_start = nullptr; // before compact/select kernels + cudaEvent_t e_gpu2_stop = nullptr; // after compact/select kernels + }; + + /// Per-call timing breakdown populated by gpu_compact_hit_buffer. + /// All times are accumulated (ms) across calls. Reset to {} at the start of each run(). + struct CompactionTimings + { + float gpu_phase1_ms = 0.f; // count + scan + reduce kernels (GPU time via CUDA events) + float scalar_dth_ms = 0.f; // 3x scalar D\u2192H cudaMemcpy (CPU wall-clock) + float gpu_phase2_ms = 0.f; // compact + select kernels (GPU time via CUDA events) + float bulk_dth_ms = 0.f; // HitRecord + ray-ID bulk D\u2192H (CPU wall-clock) + uint32_t n_calls = 0; // total gpu_compact_hit_buffer invocations }; /// Allocate all device scratch buffers for the given ray-buffer dimensions. @@ -36,6 +53,7 @@ namespace OptixCSP /// Uses pre-allocated scratch buffers — no device allocations occur inside this call. /// Appends compacted HitRecords to @p host_out and the corresponding global ray indices /// (ray_offset + local_ray_index) to @p host_ray_ids (one entry per logical hit ray). + /// Pass a non-null @p timings to accumulate per-phase timing (GPU events + CPU wall-clock). 
/// @returns Number of rays that produced at least one non-CREATE hit. uint32_t gpu_compact_hit_buffer( const HitRecord *d_hit_buffer, @@ -45,6 +63,7 @@ namespace OptixCSP std::vector &host_out, std::vector &host_ray_ids, cudaStream_t stream, - CompactionScratch &scratch); + CompactionScratch &scratch, + CompactionTimings *timings = nullptr); } // namespace OptixCSP diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 34a8e894..a4a3855e 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -283,6 +283,7 @@ void SolTraceSystem::run() m_timer_optix_launch.reset(); m_timer_collect_results.reset(); m_n_run_iterations = 0; + m_compaction_timings = CompactionTimings{}; // Allocate device buffers and initialize RNG states once (sizes are constant across the while loop). 
allocate_device_buffers(); @@ -676,7 +677,8 @@ void SolTraceSystem::get_buffer_results() m_hit_records, m_hit_ray_ids, m_state.stream, - m_compaction_scratch); + m_compaction_scratch, + &m_compaction_timings); m_n_hit_rays += n_new_hits; } @@ -738,6 +740,20 @@ void SolTraceSystem::print_timing() const std::cout << " Collect results : total = " << t_collect << " s" << " avg/iter = " << t_collect * inv_n << " s" << " (" << pct(t_collect, t_trace) << " %)\n"; + if (m_compaction_timings.n_calls > 0) + { + const float inv_c = 1.0f / static_cast(m_compaction_timings.n_calls); + std::cout << std::fixed << std::setprecision(4); + std::cout << " GPU pass 1 (count/scan/reduce) : total = " << m_compaction_timings.gpu_phase1_ms << " ms" + << " avg/call = " << m_compaction_timings.gpu_phase1_ms * inv_c << " ms\n"; + std::cout << " D->H scalars (3x memcpy) : total = " << m_compaction_timings.scalar_dth_ms << " ms" + << " avg/call = " << m_compaction_timings.scalar_dth_ms * inv_c << " ms\n"; + std::cout << " GPU pass 2 (compact/select) : total = " << m_compaction_timings.gpu_phase2_ms << " ms" + << " avg/call = " << m_compaction_timings.gpu_phase2_ms * inv_c << " ms\n"; + std::cout << " D->H bulk (records+ids) : total = " << m_compaction_timings.bulk_dth_ms << " ms" + << " avg/call = " << m_compaction_timings.bulk_dth_ms * inv_c << " ms\n"; + std::cout << std::fixed << std::setprecision(6); + } std::cout << " Total trace : " << t_trace << " s\n"; std::cout << "\n--- Grand Total ---\n"; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index cd2394ae..b4c9b1fc 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -150,7 +150,8 @@ namespace OptixCSP size_t m_sun_dir_buffer_size_allocated = 0; // Pre-allocated device scratch buffers for GPU 
stream compaction. - CompactionScratch m_compaction_scratch; + CompactionScratch m_compaction_scratch; + CompactionTimings m_compaction_timings; std::vector> m_element_list; void create_shader_binding_table(); From 5e752615bb730761799e5ad1ed210c0f8b240b3b Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Thu, 14 May 2026 15:30:32 -0600 Subject: [PATCH 28/60] Use page locked/pinned memory for device to host copies --- .../OptixCSP/src/core/ray_utils.cu | 32 +++++++++++-------- .../OptixCSP/src/core/ray_utils.h | 6 +++- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu index 109bebae..2d84e3be 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu @@ -9,6 +9,7 @@ #include #include +#include #include namespace OptixCSP @@ -124,6 +125,10 @@ namespace OptixCSP // Worst-case compacted output: every slot in the hit buffer could be kept CUDA_CHECK(cudaMalloc(&scratch.d_compacted, num_rays * max_depth * sizeof(HitRecord))); + // Pinned host staging buffers — avoids CUDA's internal small-chunk staging for pageable memory + CUDA_CHECK(cudaMallocHost(&scratch.h_compacted, num_rays * max_depth * sizeof(HitRecord))); + CUDA_CHECK(cudaMallocHost(&scratch.h_ray_ids, num_rays * sizeof(uint32_t))); + // CUDA events for GPU-phase timing CUDA_CHECK(cudaEventCreate(&scratch.e_gpu1_start)); CUDA_CHECK(cudaEventCreate(&scratch.e_gpu1_stop)); @@ -141,6 +146,9 @@ namespace OptixCSP cudaFree(scratch.d_scan_tmp); cudaFree(scratch.d_red_tmp); cudaFree(scratch.d_compacted); + // cudaFreeHost is nullptr-safe + cudaFreeHost(scratch.h_compacted); + cudaFreeHost(scratch.h_ray_ids); // cudaEventDestroy is not nullptr-safe if (scratch.e_gpu1_start) cudaEventDestroy(scratch.e_gpu1_start); if (scratch.e_gpu1_stop) 
cudaEventDestroy(scratch.e_gpu1_stop); @@ -238,27 +246,25 @@ namespace OptixCSP timings->gpu_phase2_ms += ms; } - // ---- D→H bulk memcpy (CPU wall-clock) ---- + // ---- D→H bulk: device → pinned staging (full PCIe bandwidth), then + // CPU memcpy pinned → std::vector (DRAM bandwidth) ---- std::chrono::high_resolution_clock::time_point t_bulk; if (timings) t_bulk = std::chrono::high_resolution_clock::now(); - // ---- Copy compacted HitRecords to host ---- + CUDA_CHECK(cudaMemcpy(scratch.h_compacted, scratch.d_compacted, + total_records * sizeof(HitRecord), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(scratch.h_ray_ids, scratch.d_offsets, + n_hit_rays * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + const size_t prev_rec = host_out.size(); host_out.resize(prev_rec + total_records); - CUDA_CHECK(cudaMemcpy( - host_out.data() + prev_rec, - scratch.d_compacted, - total_records * sizeof(HitRecord), - cudaMemcpyDeviceToHost)); + std::memcpy(host_out.data() + prev_rec, scratch.h_compacted, + total_records * sizeof(HitRecord)); - // ---- Copy global ray IDs to host (one per logical hit ray) ---- const size_t prev_ids = host_ray_ids.size(); host_ray_ids.resize(prev_ids + n_hit_rays); - CUDA_CHECK(cudaMemcpy( - host_ray_ids.data() + prev_ids, - scratch.d_offsets, - n_hit_rays * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); + std::memcpy(host_ray_ids.data() + prev_ids, scratch.h_ray_ids, + n_hit_rays * sizeof(uint32_t)); if (timings) timings->bulk_dth_ms += std::chrono::duration( diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h index ad2e7a43..42d1cae5 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h @@ -24,6 +24,10 @@ namespace OptixCSP size_t red_bytes = 0; HitRecord *d_compacted = nullptr; // worst-case compacted output (num_rays * max_depth) + // 
Pinned host staging buffers for fast D->H transfer (matched worst-case size). + HitRecord *h_compacted = nullptr; // pinned mirror of d_compacted + uint32_t *h_ray_ids = nullptr; // pinned mirror of d_offsets (ray ID output) + // CUDA events for GPU-phase timing (non-null after allocate_compaction_scratch). cudaEvent_t e_gpu1_start = nullptr; // before count/scan/reduce kernels cudaEvent_t e_gpu1_stop = nullptr; // after count/scan/reduce kernels @@ -36,7 +40,7 @@ namespace OptixCSP struct CompactionTimings { float gpu_phase1_ms = 0.f; // count + scan + reduce kernels (GPU time via CUDA events) - float scalar_dth_ms = 0.f; // 3x scalar D\u2192H cudaMemcpy (CPU wall-clock) + float scalar_dth_ms = 0.f; // 3x scalar D->H cudaMemcpy (CPU wall-clock) float gpu_phase2_ms = 0.f; // compact + select kernels (GPU time via CUDA events) float bulk_dth_ms = 0.f; // HitRecord + ray-ID bulk D\u2192H (CPU wall-clock) uint32_t n_calls = 0; // total gpu_compact_hit_buffer invocations From 9f0bcd1e61f214ecb1d643e77924c55b821fcd4d Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Thu, 14 May 2026 16:04:12 -0600 Subject: [PATCH 29/60] Revert page locked memory host memory as it hurt performance --- .../OptixCSP/src/core/ray_utils.cu | 32 ++++++++----------- .../OptixCSP/src/core/ray_utils.h | 4 --- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu index 2d84e3be..109bebae 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu @@ -9,7 +9,6 @@ #include #include -#include #include namespace OptixCSP @@ -125,10 +124,6 @@ namespace OptixCSP // Worst-case compacted output: every slot in the hit buffer could be kept CUDA_CHECK(cudaMalloc(&scratch.d_compacted, num_rays * max_depth * sizeof(HitRecord))); - // Pinned host staging 
buffers — avoids CUDA's internal small-chunk staging for pageable memory - CUDA_CHECK(cudaMallocHost(&scratch.h_compacted, num_rays * max_depth * sizeof(HitRecord))); - CUDA_CHECK(cudaMallocHost(&scratch.h_ray_ids, num_rays * sizeof(uint32_t))); - // CUDA events for GPU-phase timing CUDA_CHECK(cudaEventCreate(&scratch.e_gpu1_start)); CUDA_CHECK(cudaEventCreate(&scratch.e_gpu1_stop)); @@ -146,9 +141,6 @@ namespace OptixCSP cudaFree(scratch.d_scan_tmp); cudaFree(scratch.d_red_tmp); cudaFree(scratch.d_compacted); - // cudaFreeHost is nullptr-safe - cudaFreeHost(scratch.h_compacted); - cudaFreeHost(scratch.h_ray_ids); // cudaEventDestroy is not nullptr-safe if (scratch.e_gpu1_start) cudaEventDestroy(scratch.e_gpu1_start); if (scratch.e_gpu1_stop) cudaEventDestroy(scratch.e_gpu1_stop); @@ -246,25 +238,27 @@ namespace OptixCSP timings->gpu_phase2_ms += ms; } - // ---- D→H bulk: device → pinned staging (full PCIe bandwidth), then - // CPU memcpy pinned → std::vector (DRAM bandwidth) ---- + // ---- D→H bulk memcpy (CPU wall-clock) ---- std::chrono::high_resolution_clock::time_point t_bulk; if (timings) t_bulk = std::chrono::high_resolution_clock::now(); - CUDA_CHECK(cudaMemcpy(scratch.h_compacted, scratch.d_compacted, - total_records * sizeof(HitRecord), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(scratch.h_ray_ids, scratch.d_offsets, - n_hit_rays * sizeof(uint32_t), cudaMemcpyDeviceToHost)); - + // ---- Copy compacted HitRecords to host ---- const size_t prev_rec = host_out.size(); host_out.resize(prev_rec + total_records); - std::memcpy(host_out.data() + prev_rec, scratch.h_compacted, - total_records * sizeof(HitRecord)); + CUDA_CHECK(cudaMemcpy( + host_out.data() + prev_rec, + scratch.d_compacted, + total_records * sizeof(HitRecord), + cudaMemcpyDeviceToHost)); + // ---- Copy global ray IDs to host (one per logical hit ray) ---- const size_t prev_ids = host_ray_ids.size(); host_ray_ids.resize(prev_ids + n_hit_rays); - std::memcpy(host_ray_ids.data() + prev_ids, 
scratch.h_ray_ids, - n_hit_rays * sizeof(uint32_t)); + CUDA_CHECK(cudaMemcpy( + host_ray_ids.data() + prev_ids, + scratch.d_offsets, + n_hit_rays * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); if (timings) timings->bulk_dth_ms += std::chrono::duration( diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h index 42d1cae5..5d7ea8fe 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h @@ -24,10 +24,6 @@ namespace OptixCSP size_t red_bytes = 0; HitRecord *d_compacted = nullptr; // worst-case compacted output (num_rays * max_depth) - // Pinned host staging buffers for fast D->H transfer (matched worst-case size). - HitRecord *h_compacted = nullptr; // pinned mirror of d_compacted - uint32_t *h_ray_ids = nullptr; // pinned mirror of d_offsets (ray ID output) - // CUDA events for GPU-phase timing (non-null after allocate_compaction_scratch). 
cudaEvent_t e_gpu1_start = nullptr; // before count/scan/reduce kernels cudaEvent_t e_gpu1_stop = nullptr; // after count/scan/reduce kernels From a117555c2ec0e8f8e89104dc8d4fa14d53d7f18f Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Thu, 14 May 2026 16:25:50 -0600 Subject: [PATCH 30/60] Add manual batch sizing to optix runner --- .../OptixCSP/src/core/soltrace_system.cpp | 7 ++++++- .../OptixCSP/src/core/soltrace_system.h | 14 ++++++++++++++ .../optix_runner/optix_runner.cpp | 10 ++++++++++ .../optix_runner/optix_runner.hpp | 3 +++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index a4a3855e..52edbeb1 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -19,6 +19,7 @@ #include "utils/math_util.h" #include +#include #include #include #include @@ -42,6 +43,7 @@ typedef Record HitGroupRecord; SolTraceSystem::SolTraceSystem() : m_number_of_rays(0), m_max_number_of_rays(0), + m_batch_size(0), m_verbose(false), m_mem_free_before(0), m_mem_free_after(0), @@ -606,7 +608,10 @@ void SolTraceSystem::create_shader_binding_table() void SolTraceSystem::allocate_device_buffers() { // Set constant launch params (unchanged across the while loop). - data_manager->launch_params_H.width = m_number_of_rays; + const uint_fast64_t effective_batch_raw = (m_batch_size > 0) ? 
m_batch_size : m_number_of_rays; + const uint_fast64_t effective_batch = std::min(effective_batch_raw, + static_cast(std::numeric_limits::max())); + data_manager->launch_params_H.width = static_cast(effective_batch); data_manager->launch_params_H.height = 1; data_manager->launch_params_H.max_depth = MAX_TRACE_DEPTH; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index b4c9b1fc..3fcd10ff 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -2,7 +2,9 @@ #include #include +#include #include +#include #include #include @@ -69,6 +71,17 @@ namespace OptixCSP m_max_number_of_rays = maxrays; } + /// Set the number of rays launched per iteration. Use 0 (default) to + /// launch all rays in a single batch (previous behaviour). + /// Throws std::out_of_range if batch_size exceeds the maximum int value. 
+ void set_batch_size(uint_fast64_t batch_size) + { + if (batch_size > static_cast(std::numeric_limits::max())) + throw std::out_of_range("batch_size exceeds std::numeric_limits::max()"); + m_batch_size = batch_size; + } + uint_fast64_t get_batch_size() const { return m_batch_size; } + void set_sun(SolTrace::Data::Sun *sun) { m_sun = sun; } void set_seed(uint64_t seed) { m_seed = seed; } // Set sun seed @@ -114,6 +127,7 @@ namespace OptixCSP uint_fast64_t m_number_of_rays; uint_fast64_t m_max_number_of_rays; + uint_fast64_t m_batch_size = 0; // 0 means single batch (= m_number_of_rays) bool m_verbose; diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 81673d3a..9f8dce3e 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -30,6 +30,16 @@ void OptixRunner::print_timing() const m_sys.print_timing(); } +void OptixRunner::set_batch_size(uint_fast64_t batch_size) +{ + m_sys.set_batch_size(batch_size); +} + +uint_fast64_t OptixRunner::get_batch_size() const +{ + return m_sys.get_batch_size(); +} + RunnerStatus OptixRunner::initialize() { // add elements to sys using data structure from SimulationData diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 6b3e772e..ee99c197 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -38,6 +38,9 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner void set_verbose(bool verbose); + void set_batch_size(uint_fast64_t batch_size); + uint_fast64_t get_batch_size() const; + // Runner options // void disable_sun_shape_errors() { this->include_sun_shape_errors = false; } // void enable_sun_shape_errors() { this->include_sun_shape_errors = true; } From a66795f6310ec08721dccacd58d27a14d9907ce2 
Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Thu, 14 May 2026 16:41:14 -0600 Subject: [PATCH 31/60] Add optix runner batching size tests --- .../OptixCSP/src/core/soltrace_system.h | 3 + .../optix_runner/optix_runner.cpp | 5 + .../optix_runner/optix_runner.hpp | 2 + .../optix_runner/CMakeLists.txt | 1 + .../optix_runner/batch_size_test.cpp | 132 ++++++++++++++++++ 5 files changed, 143 insertions(+) create mode 100644 google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 3fcd10ff..c2d92fe1 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -113,6 +113,9 @@ namespace OptixCSP uint_fast64_t get_N_sun_rays() const { return m_n_sun_rays; } + /// Returns the number of run() iterations executed during the last run() call. + uint64_t get_N_run_iterations() const { return m_n_run_iterations; } + /// Returns the compacted hit records (CREATE + hits, misses excluded). 
const std::vector &get_hit_records() const { return m_hit_records; } diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 9f8dce3e..dfbb277a 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -40,6 +40,11 @@ uint_fast64_t OptixRunner::get_batch_size() const return m_sys.get_batch_size(); } +uint64_t OptixRunner::get_N_run_iterations() const +{ + return m_sys.get_N_run_iterations(); +} + RunnerStatus OptixRunner::initialize() { // add elements to sys using data structure from SimulationData diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index ee99c197..5f97c17c 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -34,6 +34,8 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner uint_fast64_t get_N_sun_rays() { return m_sys.get_N_sun_rays(); } + uint64_t get_N_run_iterations() const; + void print_timing() const; void set_verbose(bool verbose); diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt b/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt index 3103ba0e..c77660a2 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt +++ b/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt @@ -20,6 +20,7 @@ set(OPTIX_RUNNER_TEST_SRC flat_optical_test.cpp two_plate_test.cpp sun_test.cpp + batch_size_test.cpp ) add_executable(OptixRunnerUnitTests diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp new file mode 100644 index 00000000..fa34ee1e --- /dev/null +++ 
b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp @@ -0,0 +1,132 @@ +#include + +#include +#include + +#include +#include +#include +#include + +using SolTrace::Runner::RunnerStatus; + +// Reuse the two-plate scene defined in two_plate_test.cpp +void make_two_plate_sd(SimulationData& sd, element_ptr& plate1, element_ptr& plate2); + +// --------------------------------------------------------------------------- +// set_batch_size / get_batch_size accessor tests (no GPU required) +// --------------------------------------------------------------------------- + +TEST(OptixRunnerBatchSize, DefaultIsZero) +{ + OptixRunner runner; + EXPECT_EQ(runner.get_batch_size(), 0u); +} + +TEST(OptixRunnerBatchSize, SetAndGet) +{ + OptixRunner runner; + runner.set_batch_size(500); + EXPECT_EQ(runner.get_batch_size(), 500u); +} + +TEST(OptixRunnerBatchSize, SetZeroRestoresDefault) +{ + OptixRunner runner; + runner.set_batch_size(1000); + runner.set_batch_size(0); + EXPECT_EQ(runner.get_batch_size(), 0u); +} + +TEST(OptixRunnerBatchSize, ThrowsOnOverflow) +{ + OptixRunner runner; + const uint_fast64_t too_large = + static_cast(std::numeric_limits::max()) + 1ULL; + EXPECT_THROW(runner.set_batch_size(too_large), std::out_of_range); + // Value should be unchanged after the throw + EXPECT_EQ(runner.get_batch_size(), 0u); +} + +// --------------------------------------------------------------------------- +// Simulation correctness: batched run should yield the same hit count as the +// default single-batch run. 
+// --------------------------------------------------------------------------- + +TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount) +{ + const int N_rays = 10000; + + // --- reference run (default single batch) --- + SimulationData sd_ref; + element_ptr p1_ref, p2_ref; + make_two_plate_sd(sd_ref, p1_ref, p2_ref); + sd_ref.get_simulation_parameters().number_of_rays = N_rays; + sd_ref.get_simulation_parameters().max_number_of_rays = N_rays * 100; + + OptixRunner ref_runner; + ASSERT_EQ(ref_runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(ref_runner.setup_simulation(&sd_ref), RunnerStatus::SUCCESS); + ASSERT_EQ(ref_runner.run_simulation(), RunnerStatus::SUCCESS); + + SimulationResult ref_result; + ASSERT_EQ(ref_runner.report_simulation(&ref_result, 0), RunnerStatus::SUCCESS); + const int ref_hits = ref_result.get_number_of_records(); + + // --- batched run (batch_size = 1000, i.e. 10 iterations) --- + SimulationData sd_batch; + element_ptr p1_batch, p2_batch; + make_two_plate_sd(sd_batch, p1_batch, p2_batch); + sd_batch.get_simulation_parameters().number_of_rays = N_rays; + sd_batch.get_simulation_parameters().max_number_of_rays = N_rays * 100; + + OptixRunner batch_runner; + batch_runner.set_batch_size(1000); + ASSERT_EQ(batch_runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(batch_runner.setup_simulation(&sd_batch), RunnerStatus::SUCCESS); + ASSERT_EQ(batch_runner.run_simulation(), RunnerStatus::SUCCESS); + + SimulationResult batch_result; + ASSERT_EQ(batch_runner.report_simulation(&batch_result, 0), RunnerStatus::SUCCESS); + const int batch_hits = batch_result.get_number_of_records(); + + EXPECT_EQ(ref_hits, N_rays); + EXPECT_EQ(batch_hits, N_rays); + + // Default (single-batch) run completes in one iteration; batched run needs more + EXPECT_EQ(ref_runner.get_N_run_iterations(), 1u); + EXPECT_GT(batch_runner.get_N_run_iterations(), 1u); +} + +// --------------------------------------------------------------------------- +// Batch size smaller 
than total rays forces multiple iterations. +// --------------------------------------------------------------------------- + +TEST(OptixRunnerBatchSize, SmallBatchMultipleIterations) +{ + const int N_rays = 5000; + const int batch = 500; // 10+ iterations needed + + SimulationData sd; + element_ptr plate1, plate2; + make_two_plate_sd(sd, plate1, plate2); + sd.get_simulation_parameters().number_of_rays = N_rays; + sd.get_simulation_parameters().max_number_of_rays = N_rays * 100; + + OptixRunner runner; + runner.set_batch_size(batch); + ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS); + + SimulationResult result; + ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS); + + EXPECT_EQ(result.get_number_of_records(), N_rays); + + // With a small batch the runner must have generated at least N_rays sun rays + EXPECT_GE(runner.get_N_sun_rays(), static_cast(N_rays)); + + // Multiple iterations must have been needed + EXPECT_GE(runner.get_N_run_iterations(), 10u); +} From 139896bc33037a9993fab7a1f7b5c05fd3c2d363 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Fri, 15 May 2026 08:33:58 -0600 Subject: [PATCH 32/60] Fix batch size test --- .../optix_runner/batch_size_test.cpp | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp index fa34ee1e..e50101cd 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp @@ -11,7 +11,7 @@ using SolTrace::Runner::RunnerStatus; // Reuse the two-plate scene defined in two_plate_test.cpp -void make_two_plate_sd(SimulationData& sd, element_ptr& plate1, element_ptr& plate2); +void 
make_two_plate_sd(SimulationData &sd, element_ptr &plate1, element_ptr &plate2); // --------------------------------------------------------------------------- // set_batch_size / get_batch_size accessor tests (no GPU required) @@ -65,9 +65,9 @@ TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount) sd_ref.get_simulation_parameters().max_number_of_rays = N_rays * 100; OptixRunner ref_runner; - ASSERT_EQ(ref_runner.initialize(), RunnerStatus::SUCCESS); - ASSERT_EQ(ref_runner.setup_simulation(&sd_ref), RunnerStatus::SUCCESS); - ASSERT_EQ(ref_runner.run_simulation(), RunnerStatus::SUCCESS); + ASSERT_EQ(ref_runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(ref_runner.setup_simulation(&sd_ref), RunnerStatus::SUCCESS); + ASSERT_EQ(ref_runner.run_simulation(), RunnerStatus::SUCCESS); SimulationResult ref_result; ASSERT_EQ(ref_runner.report_simulation(&ref_result, 0), RunnerStatus::SUCCESS); @@ -82,9 +82,9 @@ TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount) OptixRunner batch_runner; batch_runner.set_batch_size(1000); - ASSERT_EQ(batch_runner.initialize(), RunnerStatus::SUCCESS); - ASSERT_EQ(batch_runner.setup_simulation(&sd_batch), RunnerStatus::SUCCESS); - ASSERT_EQ(batch_runner.run_simulation(), RunnerStatus::SUCCESS); + ASSERT_EQ(batch_runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(batch_runner.setup_simulation(&sd_batch), RunnerStatus::SUCCESS); + ASSERT_EQ(batch_runner.run_simulation(), RunnerStatus::SUCCESS); SimulationResult batch_result; ASSERT_EQ(batch_runner.report_simulation(&batch_result, 0), RunnerStatus::SUCCESS); @@ -93,9 +93,8 @@ TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount) EXPECT_EQ(ref_hits, N_rays); EXPECT_EQ(batch_hits, N_rays); - // Default (single-batch) run completes in one iteration; batched run needs more - EXPECT_EQ(ref_runner.get_N_run_iterations(), 1u); - EXPECT_GT(batch_runner.get_N_run_iterations(), 1u); + // Batched run needs more than 10 iterations with 1000 batch size for 10000 rays + 
EXPECT_GE(batch_runner.get_N_run_iterations(), 10u); } // --------------------------------------------------------------------------- @@ -105,7 +104,7 @@ TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount) TEST(OptixRunnerBatchSize, SmallBatchMultipleIterations) { const int N_rays = 5000; - const int batch = 500; // 10+ iterations needed + const int batch = 500; // 10+ iterations needed SimulationData sd; element_ptr plate1, plate2; @@ -115,9 +114,9 @@ TEST(OptixRunnerBatchSize, SmallBatchMultipleIterations) OptixRunner runner; runner.set_batch_size(batch); - ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS); - ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); - ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS); SimulationResult result; ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS); From 1e9b4cc8f388996e6db402b994274b2a08924480 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Fri, 15 May 2026 09:39:26 -0600 Subject: [PATCH 33/60] Fix ctest extra lines printing --- .../OptixCSP/src/core/pipeline_manager.cpp | 12 +++++++++++- .../optix_runner/OptixCSP/src/utils/util_check.hpp | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp index c43e14eb..b012e95a 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp @@ -259,7 +259,12 @@ void pipelineManager::createSunProgram() desc.raygen.entryFunctionName = "__raygen__sun_source"; // Create the program group - OPTIX_CHECK_LOG(optixProgramGroupCreate( + // Note: OPTIX_CHECK_LOG is not used here 
because the macro creates its own + // local LOG_ buffer (not the global LOG), causing it to always print 2048 + // null bytes to stderr. Instead we use OPTIX_CHECK and manually print any + // non-empty log content when verbose mode is enabled. + LOG_SIZE = sizeof(LOG); + OPTIX_CHECK(optixProgramGroupCreate( m_state.context, // OptiX context. &desc, // Descriptor defining the program group. 1, // Number of program groups to create (1 in this case). @@ -267,6 +272,11 @@ void pipelineManager::createSunProgram() LOG, &LOG_SIZE, // Logs to capture diagnostic information. &group // Output: Handle for the created program group. )); + if (LOG_SIZE > 1 && LOG[0] != '\0') + { + std::cerr << "OptiX log for optixProgramGroupCreate (sun):\n" + << std::string(LOG, LOG + LOG_SIZE) << std::endl; + } m_program_groups.push_back(group); m_state.raygen_prog_group = group; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/utils/util_check.hpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/utils/util_check.hpp index c9247570..e8927199 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/utils/util_check.hpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/utils/util_check.hpp @@ -83,7 +83,7 @@ namespace OptixCSP { << std::string(log, log + log_size); throw std::runtime_error(oss.str()); } - else if (log_size > 1) + else if (log_size > 1 && log != nullptr && log[0] != '\0') { std::cerr << "OptiX log for " << func << ":\n" << std::string(log, log + log_size) << std::endl; From e088c17de28a3cdfe5b80ca409d8214054a02a59 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Fri, 15 May 2026 15:25:57 -0600 Subject: [PATCH 34/60] Remove profiling flags --- .../optix_runner/OptixCSP/src/CMakeLists.txt | 9 -------- .../OptixCSP/src/core/soltrace_system.cpp | 22 +------------------ 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/CMakeLists.txt 
b/coretrace/simulation_runner/optix_runner/OptixCSP/src/CMakeLists.txt index 5fe3c812..1b9591b7 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/CMakeLists.txt +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/CMakeLists.txt @@ -64,15 +64,6 @@ target_link_libraries(OptixCSP_core PRIVATE CUDA::cuda_driver CUDA::cudart) -# NVTX3 profiling support (header-only, ships with CUDA Toolkit >= 10.0) -if(TARGET CUDA::nvtx3) - target_link_libraries(OptixCSP_core PRIVATE CUDA::nvtx3) - target_compile_definitions(OptixCSP_core PRIVATE NVTX_ENABLED) - message(STATUS "NVTX3 profiling enabled for OptixCSP_core") -else() - message(STATUS "CUDA::nvtx3 not found; NVTX profiling annotations disabled") -endif() - # have to specify the Optix_INCLUDE directory for the CUDA compiler target_compile_options(OptixCSP_core PRIVATE $<$:--use_fast_math -lineinfo -I"${OptiX_INCLUDE}"> diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 52edbeb1..b9c00e8d 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -25,10 +25,6 @@ #include #include -#ifdef NVTX_ENABLED -#include -#endif - #include #include @@ -268,10 +264,6 @@ void SolTraceSystem::initialize() void SolTraceSystem::run() { -#ifdef NVTX_ENABLED - NVTX3_FUNC_RANGE(); -#endif - // Initialize results m_hit_records.clear(); m_n_hit_rays = 0; @@ -300,9 +292,6 @@ void SolTraceSystem::run() // Allocate buffer (sets data_manager->launch_params_H buffer) m_timer_setup_buffer.start(); { -#ifdef NVTX_ENABLED - nvtx3::scoped_range nvtx_setup{"setup_device_buffer"}; -#endif setup_device_buffer(); } m_timer_setup_buffer.stop(); @@ -320,9 +309,6 @@ void SolTraceSystem::run() // Launch the simulation. 
m_timer_optix_launch.start(); { -#ifdef NVTX_ENABLED - nvtx3::scoped_range nvtx_launch{"optixLaunch"}; -#endif OPTIX_CHECK(optixLaunch( m_state.pipeline, m_state.stream, // Assume this stream is properly created. @@ -333,15 +319,12 @@ void SolTraceSystem::run() height, 1)); CUDA_SYNC_CHECK(); - } // nvtx_launch + } m_timer_optix_launch.stop(); // Collect results m_timer_collect_results.start(); { -#ifdef NVTX_ENABLED - nvtx3::scoped_range nvtx_collect{"get_buffer_results"}; -#endif get_buffer_results(); } m_timer_collect_results.stop(); @@ -667,9 +650,6 @@ void SolTraceSystem::setup_device_buffer() // m_hit_records and m_n_hit_rays is incremented by the number of newly collected hit rays. void SolTraceSystem::get_buffer_results() { -#ifdef NVTX_ENABLED - NVTX3_FUNC_RANGE(); -#endif const uint32_t num_rays = static_cast(data_manager->launch_params_H.width * data_manager->launch_params_H.height); const uint32_t max_depth = static_cast(data_manager->launch_params_H.max_depth); From df683327b3a8f439e036e837dbe2d10da775c6a3 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Mon, 18 May 2026 09:30:01 -0600 Subject: [PATCH 35/60] Add automatic batch sizing based on available GPU memory --- .../OptixCSP/src/core/soltrace_system.cpp | 80 ++++++++++++++++++- .../OptixCSP/src/core/soltrace_system.h | 24 ++++-- .../optix_runner/batch_size_test.cpp | 64 +++++++++++++-- 3 files changed, 152 insertions(+), 16 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index b9c00e8d..c5e29033 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -159,7 +159,8 @@ void SolTraceSystem::initialize() sun_vec_norm = glm::normalize(sun_vec_norm); data_manager->launch_params_H.sun_vector = make_float3(static_cast(sun_vec_norm[0]), - 
static_cast(sun_vec_norm[1]), static_cast(sun_vec_norm[2])); + static_cast(sun_vec_norm[1]), + static_cast(sun_vec_norm[2])); // Set generation type switch (m_sun->get_gen_type()) @@ -591,9 +592,7 @@ void SolTraceSystem::create_shader_binding_table() void SolTraceSystem::allocate_device_buffers() { // Set constant launch params (unchanged across the while loop). - const uint_fast64_t effective_batch_raw = (m_batch_size > 0) ? m_batch_size : m_number_of_rays; - const uint_fast64_t effective_batch = std::min(effective_batch_raw, - static_cast(std::numeric_limits::max())); + const uint_fast64_t effective_batch = determine_batch_size(); data_manager->launch_params_H.width = static_cast(effective_batch); data_manager->launch_params_H.height = 1; data_manager->launch_params_H.max_depth = MAX_TRACE_DEPTH; @@ -758,3 +757,76 @@ double SolTraceSystem::get_sun_plane_area() const a.x * b.y - a.y * b.x); return static_cast(sqrtf(cross.x * cross.x + cross.y * cross.y + cross.z * cross.z)); } + +uint_fast64_t SolTraceSystem::automatic_batch_size() const +{ + // Query free GPU memory *after* the BVH and pipeline have been built so + // that only the ray-data buffers need to fit in the remaining space. + size_t mem_free, mem_total; + CUDA_CHECK(cudaMemGetInfo(&mem_free, &mem_total)); + + // Reserve 20 % headroom for OptiX internal allocations, memory + // fragmentation, and any other transient allocations during launch. 
+ constexpr double kUsableFraction = 0.80; + const size_t usable_bytes = static_cast( + static_cast(mem_free) * kUsableFraction); + + // Per-ray device memory charged by allocate_device_buffers() and + // allocate_compaction_scratch(): + // hit_buffer MAX_TRACE_DEPTH * sizeof(HitRecord) -- trace output + // d_compacted MAX_TRACE_DEPTH * sizeof(HitRecord) -- worst-case compacted copy + // sun_dir_buffer sizeof(float3) -- sun ray direction + // curand states sizeof(curandState) -- RNG state + // d_count sizeof(uint32_t) -- compaction hit count + // d_offsets sizeof(uint32_t) -- compaction prefix sum + // d_has_hit sizeof(uint32_t) -- per-ray hit flag + const size_t bytes_per_ray = + 2u * MAX_TRACE_DEPTH * sizeof(HitRecord) + sizeof(float3) + sizeof(curandState) + 3u * sizeof(uint32_t); + + const uint_fast64_t computed = + (bytes_per_ray > 0) ? static_cast(usable_bytes / bytes_per_ray) : 0u; + + // Cap at int max (OptiX launch width is signed int). + uint_fast64_t batch_size = std::min( + computed, + static_cast(std::numeric_limits::max())); + + if (m_verbose) + { + std::cout << "automatic_batch_size:" + << " free=" << mem_free / (1024.0 * 1024.0) << " MB" + << ", usable=" << usable_bytes / (1024.0 * 1024.0) << " MB" + << ", bytes_per_ray=" << bytes_per_ray + << ", batch_size=" << batch_size << "\n"; + } + + return batch_size; +} + +uint_fast64_t SolTraceSystem::determine_batch_size() const +{ + // Estimates number of rays that can be traced in a single batch based on + // available GPU memory. + uint_fast64_t batch_size = automatic_batch_size(); + + if (m_batch_size > 0) + { + if (m_batch_size > batch_size && batch_size > 0) + { + std::cerr << "[SolTraceSystem] WARNING: user-supplied batch_size (" + << m_batch_size + << ") exceeds the GPU-memory-safe automatic batch size (" + << batch_size + << "). 
This may cause device out-of-memory errors or " + "degraded GPU performance.\n"; + } + batch_size = m_batch_size; + } + else + { + // Take the smaller of the automatic batch_size and number of rays? + batch_size = batch_size > 0 ? std::min(batch_size, m_number_of_rays) : m_number_of_rays; + } + + return batch_size; +} diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index c2d92fe1..c71fabed 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -71,8 +71,9 @@ namespace OptixCSP m_max_number_of_rays = maxrays; } - /// Set the number of rays launched per iteration. Use 0 (default) to - /// launch all rays in a single batch (previous behaviour). + /// Set the number of rays launched per iteration. + /// Use 0 (default) to let determine_batch_size() automatically compute a + /// batch size that fits the ray-data buffers in available GPU memory. /// Throws std::out_of_range if batch_size exceeds the maximum int value. void set_batch_size(uint_fast64_t batch_size) { @@ -130,7 +131,7 @@ namespace OptixCSP uint_fast64_t m_number_of_rays; uint_fast64_t m_max_number_of_rays; - uint_fast64_t m_batch_size = 0; // 0 means single batch (= m_number_of_rays) + uint_fast64_t m_batch_size = 0; // 0 means auto-size: determine_batch_size() calls automatic_batch_size() bool m_verbose; @@ -150,11 +151,11 @@ namespace OptixCSP // Compacted hit records: one contiguous array of HitRecord. // Each ray group starts with a HIT_CREATE record followed by its hits. // Rays that produced no hits (CREATE-only) are excluded. - std::vector m_hit_records; + std::vector m_hit_records; // Global ray index (ray_offset + local_index) for each logical hit ray in m_hit_records. // Parallel to the logical rays (not records): m_hit_ray_ids.size() == m_n_hit_rays. 
- std::vector m_hit_ray_ids; + std::vector m_hit_ray_ids; // Count of rays that produced at least one non-CREATE hit. uint_fast64_t m_n_hit_rays = 0; @@ -167,8 +168,8 @@ namespace OptixCSP size_t m_sun_dir_buffer_size_allocated = 0; // Pre-allocated device scratch buffers for GPU stream compaction. - CompactionScratch m_compaction_scratch; - CompactionTimings m_compaction_timings; + CompactionScratch m_compaction_scratch; + CompactionTimings m_compaction_timings; std::vector> m_element_list; void create_shader_binding_table(); @@ -177,6 +178,15 @@ namespace OptixCSP // GPU-side compaction: count hits, compact buffer on device, copy result to m_hit_records. // Increments m_n_hit_rays by the number of newly collected hit rays. void get_buffer_results(); + /// Computes the maximum rays-per-batch that fit in 80 % of current free + /// GPU memory, accounting for all per-ray device buffers and compaction + /// scratch. Returns 0 if memory cannot be queried. + uint_fast64_t automatic_batch_size() const; + /// Returns the effective batch size for a run() call. + /// If m_batch_size > 0 the user-supplied value is used as-is. + /// Otherwise automatic_batch_size() is called and the result is capped + /// at m_number_of_rays. 
+ uint_fast64_t determine_batch_size() const; Timer m_timer_setup; Timer m_timer_trace; diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp index e50101cd..127e032f 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp @@ -17,7 +17,10 @@ void make_two_plate_sd(SimulationData &sd, element_ptr &plate1, element_ptr &pla // set_batch_size / get_batch_size accessor tests (no GPU required) // --------------------------------------------------------------------------- -TEST(OptixRunnerBatchSize, DefaultIsZero) +// Default value of 0 means automatic batch sizing: determine_batch_size() will +// call automatic_batch_size() to compute a GPU-memory-safe batch size at run +// time. It does NOT mean "launch all rays in a single batch". +TEST(OptixRunnerBatchSize, DefaultIsZeroMeansAutoSize) { OptixRunner runner; EXPECT_EQ(runner.get_batch_size(), 0u); @@ -30,7 +33,8 @@ TEST(OptixRunnerBatchSize, SetAndGet) EXPECT_EQ(runner.get_batch_size(), 500u); } -TEST(OptixRunnerBatchSize, SetZeroRestoresDefault) +// Setting back to 0 restores automatic GPU-memory-based sizing. +TEST(OptixRunnerBatchSize, SetZeroRestoresAutoSize) { OptixRunner runner; runner.set_batch_size(1000); @@ -48,6 +52,27 @@ TEST(OptixRunnerBatchSize, ThrowsOnOverflow) EXPECT_EQ(runner.get_batch_size(), 0u); } +// INT_MAX itself is the largest valid batch size; it must not throw. +TEST(OptixRunnerBatchSize, MaxIntBoundaryDoesNotThrow) +{ + OptixRunner runner; + const uint_fast64_t max_valid = + static_cast(std::numeric_limits::max()); + EXPECT_NO_THROW(runner.set_batch_size(max_valid)); + EXPECT_EQ(runner.get_batch_size(), max_valid); +} + +// A failed set_batch_size must not corrupt a previously stored non-zero value. 
+TEST(OptixRunnerBatchSize, ThrowPreservesExistingValue) +{ + OptixRunner runner; + runner.set_batch_size(999); + const uint_fast64_t too_large = + static_cast(std::numeric_limits::max()) + 1ULL; + EXPECT_THROW(runner.set_batch_size(too_large), std::out_of_range); + EXPECT_EQ(runner.get_batch_size(), 999u); +} + // --------------------------------------------------------------------------- // Simulation correctness: batched run should yield the same hit count as the // default single-batch run. @@ -57,7 +82,8 @@ TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount) { const int N_rays = 10000; - // --- reference run (default single batch) --- + // --- reference run (default auto-sized batch: m_batch_size == 0 defers to + // determine_batch_size() / automatic_batch_size()) --- SimulationData sd_ref; element_ptr p1_ref, p2_ref; make_two_plate_sd(sd_ref, p1_ref, p2_ref); @@ -73,7 +99,7 @@ TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount) ASSERT_EQ(ref_runner.report_simulation(&ref_result, 0), RunnerStatus::SUCCESS); const int ref_hits = ref_result.get_number_of_records(); - // --- batched run (batch_size = 1000, i.e. 10 iterations) --- + // --- explicit batched run (user-supplied batch_size = 1000, ~10 iterations) --- SimulationData sd_batch; element_ptr p1_batch, p2_batch; make_two_plate_sd(sd_batch, p1_batch, p2_batch); @@ -93,7 +119,8 @@ TEST(OptixRunnerBatchSize, BatchedRunMatchesHitCount) EXPECT_EQ(ref_hits, N_rays); EXPECT_EQ(batch_hits, N_rays); - // Batched run needs more than 10 iterations with 1000 batch size for 10000 rays + // With a user-supplied batch of 1000 and 10000 rays, at least 10 iterations + // are required regardless of available GPU memory. 
EXPECT_GE(batch_runner.get_N_run_iterations(), 10u); } @@ -129,3 +156,30 @@ TEST(OptixRunnerBatchSize, SmallBatchMultipleIterations) // Multiple iterations must have been needed EXPECT_GE(runner.get_N_run_iterations(), 10u); } + +// --------------------------------------------------------------------------- +// Batch size >= N_rays: should complete in exactly one iteration. +// --------------------------------------------------------------------------- + +TEST(OptixRunnerBatchSize, BatchSizeExceedingRaysCompletesInOneIteration) +{ + const int N_rays = 1000; + + SimulationData sd; + element_ptr plate1, plate2; + make_two_plate_sd(sd, plate1, plate2); + sd.get_simulation_parameters().number_of_rays = N_rays; + sd.get_simulation_parameters().max_number_of_rays = N_rays * 100; + + OptixRunner runner; + runner.set_batch_size(N_rays * 2); // larger than N_rays + ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS); + + SimulationResult result; + ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS); + + EXPECT_EQ(result.get_number_of_records(), N_rays); + EXPECT_EQ(runner.get_N_run_iterations(), 1u); +} From ea4dd5feb96a03d118789b4c4e8c154625459e29 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Mon, 18 May 2026 09:52:43 -0600 Subject: [PATCH 36/60] Change hit flag buffer from 32-bit to 8-bit --- .../optix_runner/OptixCSP/src/core/ray_utils.cu | 13 +++++++------ .../optix_runner/OptixCSP/src/core/ray_utils.h | 2 +- .../OptixCSP/src/core/soltrace_system.cpp | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu index 109bebae..284a1c2a 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu +++ 
b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu @@ -28,7 +28,7 @@ namespace OptixCSP uint32_t num_rays, uint32_t max_depth, uint32_t *__restrict__ out_record_count, - uint32_t *__restrict__ out_has_hit) + uint8_t *__restrict__ out_has_hit) { const uint32_t ray = blockIdx.x * blockDim.x + threadIdx.x; if (ray >= num_rays) @@ -47,7 +47,7 @@ namespace OptixCSP const uint32_t has_hit = (raw_count > 1) ? 1u : 0u; out_record_count[ray] = has_hit ? raw_count : 0u; - out_has_hit[ray] = has_hit; + out_has_hit[ray] = static_cast(has_hit); return; } @@ -65,7 +65,7 @@ namespace OptixCSP uint32_t num_rays, uint32_t max_depth, const uint32_t *__restrict__ offsets, - const uint32_t *__restrict__ has_hit, + const uint8_t *__restrict__ has_hit, HitRecord *__restrict__ out_buffer) { const uint32_t ray = blockIdx.x * blockDim.x + threadIdx.x; @@ -103,18 +103,19 @@ namespace OptixCSP CUDA_CHECK(cudaMalloc(&scratch.d_count, num_rays * sizeof(uint32_t))); CUDA_CHECK(cudaMalloc(&scratch.d_offsets, num_rays * sizeof(uint32_t))); - CUDA_CHECK(cudaMalloc(&scratch.d_has_hit, num_rays * sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(&scratch.d_has_hit, num_rays * sizeof(uint8_t))); CUDA_CHECK(cudaMalloc(&scratch.d_n_hit, sizeof(uint32_t))); // Query CUB temp-storage sizes using typed null pointers (size query only). // scan_bytes must cover both ExclusiveSum and DeviceSelect::Flagged (d_scan_tmp is reused). 
uint32_t *null_u32 = nullptr; + uint8_t *null_u8 = nullptr; cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, null_u32, null_u32, num_rays); - cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, null_u32, null_u32, num_rays); + cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, null_u8, null_u32, num_rays); size_t select_bytes = 0; thrust::counting_iterator count_iter(0u); - cub::DeviceSelect::Flagged(nullptr, select_bytes, count_iter, null_u32, null_u32, null_u32, num_rays); + cub::DeviceSelect::Flagged(nullptr, select_bytes, count_iter, null_u8, null_u32, null_u32, num_rays); if (select_bytes > scratch.scan_bytes) scratch.scan_bytes = select_bytes; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h index 5d7ea8fe..ed1a33c4 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h @@ -16,7 +16,7 @@ namespace OptixCSP { uint32_t *d_count = nullptr; // per-ray output record count uint32_t *d_offsets = nullptr; // exclusive prefix-sum of d_count - uint32_t *d_has_hit = nullptr; // 1 if ray contributes records, else 0 + uint8_t *d_has_hit = nullptr; // 1 if ray contributes records, else 0 uint32_t *d_n_hit = nullptr; // scalar: total hit rays void *d_scan_tmp = nullptr; // CUB DeviceScan temp storage size_t scan_bytes = 0; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index c5e29033..fd282267 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -779,9 +779,9 @@ uint_fast64_t SolTraceSystem::automatic_batch_size() const // curand states sizeof(curandState) -- RNG state // 
d_count sizeof(uint32_t) -- compaction hit count // d_offsets sizeof(uint32_t) -- compaction prefix sum - // d_has_hit sizeof(uint32_t) -- per-ray hit flag + // d_has_hit sizeof(uint8_t) -- per-ray hit flag const size_t bytes_per_ray = - 2u * MAX_TRACE_DEPTH * sizeof(HitRecord) + sizeof(float3) + sizeof(curandState) + 3u * sizeof(uint32_t); + 2u * MAX_TRACE_DEPTH * sizeof(HitRecord) + sizeof(float3) + sizeof(curandState) + 2u * sizeof(uint32_t) + sizeof(uint8_t); const uint_fast64_t computed = (bytes_per_ray > 0) ? static_cast(usable_bytes / bytes_per_ray) : 0u; From 2226b24348eb9ae560c5877186ed8a65c33bc093 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Mon, 18 May 2026 10:24:42 -0600 Subject: [PATCH 37/60] Remove dead includes in files; update batch size test --- .../optix_runner/OptixCSP/src/core/CspElement.cpp | 1 - .../optix_runner/OptixCSP/src/core/data_manager.cpp | 3 --- .../optix_runner/OptixCSP/src/core/geometry_manager.cpp | 1 - .../optix_runner/OptixCSP/src/core/soltrace_system.cpp | 6 +----- .../optix_runner/OptixCSP/src/core/sun_utils.cu | 1 - .../simulation_runner/optix_runner/batch_size_test.cpp | 2 +- 6 files changed, 2 insertions(+), 12 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp index 21f40049..7ade3585 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/CspElement.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include "vec3d.h" diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp index 3e93a66f..d11021ac 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp @@ -1,10 +1,7 @@ 
#include "data_manager.h" #include "soltrace_system.h" #include "utils/util_check.hpp" -#include -#include #include -#include #include #include #include "sun_utils.h" diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp index cc51276a..77c713fa 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/geometry_manager.cpp @@ -3,7 +3,6 @@ #include "sun_utils.h" #include "soltrace_state.h" #include "utils/util_check.hpp" -#include "data_manager.h" #include diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index fd282267..0a539a7e 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -9,9 +9,6 @@ #include "soltrace_type.h" #include "timer.h" -#include "../../../../../simulation_data/simdata_io.hpp" -#include "../../../../../simulation_data/solar_position_calculators/basic_sun_position.hpp" - #include "shaders/Soltrace.h" #include "utils/util_record.hpp" @@ -19,11 +16,10 @@ #include "utils/math_util.h" #include -#include -#include #include #include #include +#include #include #include diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/sun_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/sun_utils.cu index 739c2075..798b288d 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/sun_utils.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/sun_utils.cu @@ -6,7 +6,6 @@ #include "shaders/Soltrace.h" #include -#include namespace OptixCSP { diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp 
b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp index 127e032f..7bd5d058 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/batch_size_test.cpp @@ -172,7 +172,7 @@ TEST(OptixRunnerBatchSize, BatchSizeExceedingRaysCompletesInOneIteration) sd.get_simulation_parameters().max_number_of_rays = N_rays * 100; OptixRunner runner; - runner.set_batch_size(N_rays * 2); // larger than N_rays + runner.set_batch_size(N_rays * 8); // larger than N_rays ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS); ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS); From 65713565ce597e7cc8117c1c8c86fd41ad380603 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Mon, 18 May 2026 12:12:55 -0600 Subject: [PATCH 38/60] Add post simulation calls for number of rays launched and number of rays traced to the runner api; implement and test these new function; add output statements to simdriver --- coretrace/simdriver/main.cpp | 42 ++++++---- .../native_runner/native_runner.hpp | 12 +++ .../optix_runner/optix_runner.hpp | 6 +- .../simulation_runner/simulation_runner.hpp | 3 + .../native_runner/native_runner_test.cpp | 53 +++++++++++++ .../optix_runner/gpu_tower_demo.cpp | 77 +++++++++++++++++++ 6 files changed, 177 insertions(+), 16 deletions(-) diff --git a/coretrace/simdriver/main.cpp b/coretrace/simdriver/main.cpp index 11d02cbb..b1cd360d 100644 --- a/coretrace/simdriver/main.cpp +++ b/coretrace/simdriver/main.cpp @@ -13,6 +13,8 @@ * --rays Override the number of rays from the JSON file * --no-output Skip result retrieval and CSV output (output file not * required when this flag is set) + * --no-csv Retrieve results but skip writing the CSV file (output + * file argument not required when this flag is set) * --embree Use the Embree runner (only available if built with * 
SOLTRACE_BUILD_EMBREE_SUPPORT=ON; falls back to native * runner with a warning if Embree support is absent) @@ -57,6 +59,8 @@ static void print_usage(const char *prog) << " --rays Override number of rays specified in the JSON file\n" << " --no-output Skip result retrieval and CSV output\n" << " (output file argument not required with this flag)\n" + << " --no-csv Retrieve results but skip writing the CSV file\n" + << " (output file argument not required with this flag)\n" #ifdef SOLTRACE_EMBREE_SUPPORT << " --embree Use Embree runner instead of the native runner\n" << " (requires SOLTRACE_BUILD_EMBREE_SUPPORT=ON at build time)\n" @@ -77,27 +81,27 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } - // Pre-scan for --no-output so we know whether output_file is required + // Pre-scan for --no-output and --no-csv so we know whether output_file is required bool skip_output = false; + bool skip_csv = false; for (int i = 2; i < argc; ++i) { - if (std::string(argv[i]) == "--no-output") - { - skip_output = true; - break; - } + const std::string a = argv[i]; + if (a == "--no-output") skip_output = true; + else if (a == "--no-csv") skip_csv = true; } - if (!skip_output && argc < 3) + const bool file_optional = skip_output || skip_csv; + if (!file_optional && argc < 3) { - std::cerr << "Error: output file is required unless --no-output is specified\n"; + std::cerr << "Error: output file is required unless --no-output or --no-csv is specified\n"; print_usage(argv[0]); return EXIT_FAILURE; } const std::string input_file = argv[1]; - // output_file is only meaningful when skip_output is false - const std::string output_file = (!skip_output && argc >= 3) ? argv[2] : ""; + // output_file is only meaningful when neither skip_output nor skip_csv is set + const std::string output_file = (!file_optional && argc >= 3) ? 
argv[2] : ""; int num_threads = 1; long long num_rays_override = -1; // -1 means use what the JSON specifies @@ -105,8 +109,8 @@ int main(int argc, char *argv[]) bool use_optix = false; bool verbose = false; - // Start parsing options from argv[2] if skip_output, else from argv[3] - const int opts_start = skip_output ? 2 : 3; + // Start parsing options from argv[2] if output file is omitted, else from argv[3] + const int opts_start = file_optional ? 2 : 3; for (int i = opts_start; i < argc; ++i) { const std::string arg = argv[i]; @@ -154,7 +158,7 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } } - else if (arg == "--no-output") + else if (arg == "--no-output" || arg == "--no-csv") { // already handled in pre-scan; skip here } @@ -285,6 +289,8 @@ int main(int argc, char *argv[]) std::cout << " Completed in " << std::chrono::duration(t_run_end - t_run_start).count() << " s\n"; + std::cout << " Rays launched: " << runner.get_number_rays_launched() << "\n"; + std::cout << " Rays traced: " << runner.get_number_rays_traced() << "\n"; if (!skip_output) { @@ -349,6 +355,8 @@ int main(int argc, char *argv[]) std::cout << " Completed in " << std::chrono::duration(t_run_end - t_run_start).count() << " s\n"; + std::cout << " Rays launched: " << runner.get_number_rays_launched() << "\n"; + std::cout << " Rays traced: " << runner.get_number_rays_traced() << "\n"; if (!skip_output) { @@ -430,6 +438,8 @@ int main(int argc, char *argv[]) std::cout << " Completed in " << std::chrono::duration(t_run_end - t_run_start).count() << " s\n"; + std::cout << " Rays launched: " << runner.get_number_rays_launched() << "\n"; + std::cout << " Rays traced: " << runner.get_number_rays_traced() << "\n"; if (!skip_output) { @@ -455,7 +465,7 @@ int main(int argc, char *argv[]) // ------------------------------------------------------------------------- // Write results to CSV // ------------------------------------------------------------------------- - if (!skip_output) + if 
(!skip_output && !skip_csv) { std::cout << "Writing " << result.get_number_of_records() << " ray records to: " << output_file << "...\n"; @@ -474,6 +484,10 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } } + else if (skip_csv) + { + std::cout << "Skipping CSV output (--no-csv).\n"; + } else { std::cout << "Skipping CSV output (--no-output).\n"; diff --git a/coretrace/simulation_runner/native_runner/native_runner.hpp b/coretrace/simulation_runner/native_runner/native_runner.hpp index 800b4d2e..4e8387f7 100644 --- a/coretrace/simulation_runner/native_runner/native_runner.hpp +++ b/coretrace/simulation_runner/native_runner/native_runner.hpp @@ -39,6 +39,18 @@ namespace SolTrace::NativeRunner virtual RunnerStatus report_simulation(SolTrace::Result::SimulationResult *result, int level_spec) override; + virtual uint_fast64_t get_number_rays_launched() const override + { + return tsys.SunRayCount; + } + virtual uint_fast64_t get_number_rays_traced() const override + { + // TODO: This could be wrong if we hit max number of rays before getting this many hits. + // At the moment max number of rays is ignored though... + // return tsys.sim_raycount; + return tsys.SunRayCount > 0 ? 
tsys.sim_raycount : 0; + } + // Runner options void disable_power_tower() { this->as_power_tower = false; } void enable_power_tower() { this->as_power_tower = true; } diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 5f97c17c..8920244a 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -30,9 +30,11 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner SolTrace::Runner::RunnerStatus get_hp_output(std::vector& hp_vec, std::vector& raynumber_vec, std::vector& element_id_vec); - double get_sun_plane_area() { return m_sys.get_sun_plane_area(); } + double get_sun_plane_area() const { return m_sys.get_sun_plane_area(); } - uint_fast64_t get_N_sun_rays() { return m_sys.get_N_sun_rays(); } + uint_fast64_t get_N_sun_rays() const { return m_sys.get_N_sun_rays(); } + inline uint_fast64_t get_number_rays_launched() const override {return get_N_sun_rays(); } + inline uint_fast64_t get_number_rays_traced() const override {return m_sys.get_N_hit_rays(); } uint64_t get_N_run_iterations() const; diff --git a/coretrace/simulation_runner/simulation_runner.hpp b/coretrace/simulation_runner/simulation_runner.hpp index 91c2c7e2..e8e21ba7 100644 --- a/coretrace/simulation_runner/simulation_runner.hpp +++ b/coretrace/simulation_runner/simulation_runner.hpp @@ -64,6 +64,9 @@ namespace SolTrace::Runner virtual RunnerStatus report_simulation(SolTrace::Result::SimulationResult *result, int level_spec) = 0; + virtual uint_fast64_t get_number_rays_launched() const = 0; + virtual uint_fast64_t get_number_rays_traced() const = 0; + private: }; diff --git a/google-tests/unit-tests/simulation_runner/native_runner/native_runner_test.cpp b/google-tests/unit-tests/simulation_runner/native_runner/native_runner_test.cpp index 9dda077d..c23f0e9b 100644 --- 
a/google-tests/unit-tests/simulation_runner/native_runner/native_runner_test.cpp +++ b/google-tests/unit-tests/simulation_runner/native_runner/native_runner_test.cpp @@ -236,6 +236,59 @@ TEST(NativeRunner, SmokeTest) << std::endl; } +TEST(NativeRunner, RaysLaunchedEqualsRequestedAfterRun) +{ + const uint_fast64_t NRAYS = 10; + NativeRunner runner; + SimulationData my_sim; + + SimulationParameters ¶ms = my_sim.get_simulation_parameters(); + params.include_optical_errors = false; + params.include_sun_shape_errors = false; + params.number_of_rays = NRAYS; + params.max_number_of_rays = 10 * NRAYS; + + auto sun = SolTrace::Data::make_ray_source(); + sun->set_position(0.0, 0.0, 100.0); + sun->set_shape(SolTrace::Data::SunShape::GAUSSIAN, 1.0, -5.0, 0.0); + my_sim.add_ray_source(sun); + + auto my_st = SolTrace::Data::make_stage(0); + const int NUM_ELEMENTS = 4; + double x[NUM_ELEMENTS] = {1.0, 0.0, -1.0, 0.0}; + double y[NUM_ELEMENTS] = {0.0, 1.0, 0.0, -1.0}; + OpticalProperties optics(SolTrace::Data::InteractionType::REFLECTION, + SolTrace::Data::DistributionType::GAUSSIAN, + 0.0, 1.0, 0.0, 0.0, 1.0, 1.0); + for (int k = 0; k < NUM_ELEMENTS; ++k) + { + element_ptr el = SolTrace::Data::make_element(); + el->set_aperture(SolTrace::Data::make_aperture(2.0)); + el->set_surface(SolTrace::Data::make_surface()); + el->set_reference_frame_geometry(glm::dvec3(x[k], y[k], 0.0), + glm::dvec3(-x[k], -y[k], 1.0), + 0.0); + el->set_front_optical_properties(optics); + el->set_back_optical_properties(optics); + my_st->add_element(el); + } + my_sim.add_stage(my_st); + + RunnerStatus sts = runner.initialize(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.setup_simulation(&my_sim); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + // Before running, SunRayCount has not been accumulated yet + EXPECT_EQ(runner.get_number_rays_launched(), static_cast(0)); + + sts = runner.run_simulation(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + EXPECT_GE(runner.get_number_rays_launched(), NRAYS); 
+ EXPECT_EQ(runner.get_number_rays_traced(), NRAYS); +} + TEST(NativeRunner, PowerTowerSmokeTest) { SimulationData sd; diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/gpu_tower_demo.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/gpu_tower_demo.cpp index ea771d5a..f99fb8e8 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/gpu_tower_demo.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/gpu_tower_demo.cpp @@ -159,3 +159,80 @@ TEST(GpuTowerDemo, OptixRunnerWithStages) // sts = runner.report_simulation(); ASSERT_EQ(sts, RunnerStatus::SUCCESS); } + +static void setup_tower_sd(SimulationData &sd, uint_fast64_t nrays) +{ + auto sun = make_ray_source(); + sun->set_position(0.0, 0.0, 100.0); + sd.add_ray_source(sun); + + auto absorber = make_element(); + absorber->set_origin(0.0, 0.0, 10.0); + absorber->set_aim_vector(0.0, 5.0, 0.0); + absorber->set_surface(make_surface()); + absorber->set_aperture(make_aperture(2.0, 2.0)); + absorber->get_front_optical_properties()->set_ideal_absorption(); + + auto st1 = make_stage(1); + st1->set_origin(0.0, 0.0, 0.0); + st1->set_aim_vector(0.0, 0.0, 1.0); + st1->add_element(absorber); + + auto st0 = make_stage(0); + st0->set_origin(0.0, 0.0, 0.0); + st0->set_aim_vector(0.0, 0.0, 1.0); + + const double spacing = PI / 4.0; + for (int i = -1; i < 4; ++i) + { + auto el = make_element(); + el->get_front_optical_properties()->reflectivity = 1.0; + + glm::dvec3 pos = {5 * sin(i * spacing), 5 * cos(i * spacing), 0.0}; + el->set_origin(pos); + glm::dvec3 rvec = glm::normalize(absorber->get_origin_global() - pos); + glm::dvec3 svec = glm::normalize(sun->get_position()); + glm::dvec3 avec = 0.5 * rvec + 0.5 * svec; + el->set_aim_vector(pos + 100.0 * avec); + el->set_zrot(30.0 * i); + el->set_surface(make_surface()); + el->set_aperture(make_aperture(1.0, 1.95)); + st0->add_element(el); + } + + sd.add_stage(st0); + sd.add_stage(st1); + + SimulationParameters &params =
sd.get_simulation_parameters(); + params.number_of_rays = nrays; + params.max_number_of_rays = nrays * 100; + params.include_optical_errors = false; + params.include_sun_shape_errors = false; + params.seed = 12345; +} + +TEST(OptixRunner, RaysLaunchedEqualsRequestedAfterRun) +{ + const uint_fast64_t NRAYS = 100; + SimulationData sd; + setup_tower_sd(sd, NRAYS); + + OptixRunner runner; + RunnerStatus sts = runner.initialize(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.setup_simulation(&sd); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + // Before running, no rays have been launched yet + EXPECT_EQ(runner.get_number_rays_launched(), static_cast(0)); + + sts = runner.run_simulation(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + EXPECT_GE(runner.get_number_rays_launched(), NRAYS); + + const uint_fast64_t rays_traced = runner.get_number_rays_traced(); + EXPECT_GT(rays_traced, static_cast(0)); + EXPECT_LE(rays_traced, runner.get_number_rays_launched()); + EXPECT_EQ(rays_traced, NRAYS); +} From 41c613d35fa6133bed2cdab0bf5cc13d95a8f193 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 10:39:02 -0600 Subject: [PATCH 39/60] Make geometry intersection testing for optix runner more robust --- .../geometry_intersection_test.cpp | 179 ++++++++++++++---- 1 file changed, 146 insertions(+), 33 deletions(-) diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp index 2ad27e61..21c90c84 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp @@ -8,12 +8,13 @@ using SolTrace::Runner::RunnerStatus; const double Z_ELEM = 50.0; +const double Z_BACKSTOP = Z_ELEM - 0.5 * Z_ELEM; const double TOL = 1e-6; const uint_fast64_t NRAYS = 10000; -void set_default_sd(SimulationData &sd, - surface_ptr 
surf, - aperture_ptr ap) +element_id set_default_sd(SimulationData &sd, + surface_ptr surf, + aperture_ptr ap) { sd.clear(); @@ -22,7 +23,7 @@ void set_default_sd(SimulationData &sd, sun->set_position(0, 0, 100); sd.add_ray_source(sun); - // Make reflective flat el + // Make target element element_ptr el = make_element(); el->set_origin(0, 0, Z_ELEM); el->set_aim_vector(0, 0, 100); // Face up towards sun @@ -36,7 +37,19 @@ void set_default_sd(SimulationData &sd, el->set_name("el"); // Add element to stage - sd.add_element(el); + element_id id = sd.add_element(el); + + // Back stop element that is bigger than the created element so that the + // testing element casts a shadow on this big thing. + stop = make_element(); + double xlb, xub, ylb, yub; + ap->bounding_box(xlb, xub, ylb, yub); + const double sx = std::max(fabs(xlb), fabs(xub)) + 1.0; + const double sy = std::max(fabs(ylb), fabs(yub)) + 1.0; + stop->set_origin(0, 0, Z_BACKSTOP); + stop->set_aim_vector(0, 0, 100); + stop->set_surface(make_surface()); + stop->set_aperture(make_aperture(sx, sy)); // Set parameters SimulationParameters &params = sd.get_simulation_parameters(); @@ -45,6 +58,8 @@ void set_default_sd(SimulationData &sd, params.include_optical_errors = false; params.include_sun_shape_errors = false; params.seed = 123; + + return id; } TEST(OptixRunner, FlatRectangle) @@ -55,7 +70,7 @@ TEST(OptixRunner, FlatRectangle) auto aper = make_aperture(XL, YL); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = (sd, surf, aper); SimulationResult result; OptixRunner runner; @@ -77,9 +92,24 @@ TEST(OptixRunner, FlatRectangle) glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + + if (id == test_elid) + { + // We hit the test element.
Check that the height is as expected + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + // And that we are in the aperture + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + // We hit the back stop element. Check that the height is as expected + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + // And that we are not in the aperture. + EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } @@ -90,7 +120,7 @@ TEST(OptixRunner, FlatEquilateralTriangle) auto aper = make_aperture(d); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = set_default_sd(sd, surf, aper); SimulationResult result; OptixRunner runner; @@ -112,9 +142,20 @@ TEST(OptixRunner, FlatEquilateralTriangle) glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + + if (id == test_elid) + { + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } @@ -126,7 +167,7 @@ TEST(OptixRunner, FlatTriangle) auto aper = make_aperture(x1, y1, x2, y2, x3, y3); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = set_default_sd(sd, surf, aper); SimulationResult result; OptixRunner runner; @@ -148,9 +189,20 @@ TEST(OptixRunner, FlatTriangle) glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + + if (id == test_elid) + { + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + 
EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } @@ -164,7 +216,7 @@ TEST(OptixRunner, FlatQuadrilateral) x1, y1, x2, y2, x3, y3, x4, y4); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = set_default_sd(sd, surf, aper); SimulationResult result; OptixRunner runner; @@ -186,9 +238,20 @@ TEST(OptixRunner, FlatQuadrilateral) glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + + if (id == test_elid) + { + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } @@ -203,7 +266,7 @@ TEST(OptixRunner, ParabolaRectangle) auto aper = make_aperture(XL, YL); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = set_default_sd(sd, surf, aper); SimulationResult result; OptixRunner runner; @@ -225,10 +288,21 @@ TEST(OptixRunner, ParabolaRectangle) glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - const double z1 = Z_ELEM + 0.5 * CX * p1[0] * p1[0] + 0.5 * CY * p1[1] * p1[1]; - EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i; + + if (id == test_elid) + { + const double z1 = Z_ELEM + 0.5 * CX * p1[0] * p1[0] + 0.5 * CY * p1[1] * p1[1]; + EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } @@ -240,7 +314,7 @@ TEST(OptixRunner, Cylinder) auto aper = make_aperture(2 * R, YL); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = set_default_sd(sd, surf, 
aper); SimulationResult result; OptixRunner runner; @@ -262,10 +336,21 @@ TEST(OptixRunner, Cylinder) glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - const double z1 = Z_ELEM + sqrt(R * R - p1[0] * p1[0]); - EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i; + + if (id == test_elid) + { + const double z1 = Z_ELEM + sqrt(R * R - p1[0] * p1[0]); + EXPECT_NEAR(p1[2], z1, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } @@ -273,10 +358,10 @@ TEST(OptixRunner, FlatCircle) { const double R = 5.0; auto surf = make_surface(); - auto aper = make_aperture(2*R); + auto aper = make_aperture(2 * R); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = set_default_sd(sd, surf, aper); SimulationResult result; OptixRunner runner; @@ -295,13 +380,23 @@ TEST(OptixRunner, FlatCircle) { auto rr = result[i]; ASSERT_GE(rr->get_number_of_interactions(), 2); - glm::dvec3 p0, p1; + glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; - EXPECT_LE(sqrt(p1[0]*p1[0] + p1[1]*p1[1]), R); + + if (id == test_elid) + { + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } @@ -309,10 +404,10 @@ TEST(OptixRunner, FlatHexagon) { const double S = 5.0; auto surf = make_surface(); - auto aper = make_aperture(2*S); + auto aper = make_aperture(2 * S); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = 
set_default_sd(sd, surf, aper); SimulationResult result; OptixRunner runner; @@ -334,10 +429,20 @@ TEST(OptixRunner, FlatHexagon) glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; - EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + + if (id == test_elid) + { + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } @@ -350,7 +455,7 @@ TEST(OptixRunner, FlatAnnulus) auto aper = make_aperture(R0, R1, ARC); SimulationData sd; - set_default_sd(sd, surf, aper); + element_id test_elid = set_default_sd(sd, surf, aper); SimulationResult result; OptixRunner runner; @@ -372,11 +477,19 @@ TEST(OptixRunner, FlatAnnulus) glm::dvec3 p0, p1; rr->get_position(0, p0); rr->get_position(1, p1); + auto id = rr->get_element(1); EXPECT_NEAR(p0[0], p1[0], TOL) << "ray " << i; EXPECT_NEAR(p0[1], p1[1], TOL) << "ray " << i; - EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; - double r = sqrt(p1[0]*p1[0] + p1[1]*p1[1]); - EXPECT_GE(r, R0); - EXPECT_LE(r, R1); + + if (id == test_elid) + { + EXPECT_NEAR(p1[2], Z_ELEM, TOL * Z_ELEM) << "ray " << i; + EXPECT_TRUE(aper->is_in(p1[0], p1[1])); + } + else + { + EXPECT_NEAR(p1[2], Z_BACKSTOP, TOL * Z_ELEM); + EXPECT_FALSE(aper->is_in(p1[0], p1[1])); + } } } From e99b9d9023a7860785f75c99d4d41d3f9e56d9e5 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 10:42:19 -0600 Subject: [PATCH 40/60] Fix dumb errors --- .../optix_runner/geometry_intersection_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp 
b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp index 21c90c84..4e537927 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp @@ -50,6 +50,7 @@ element_id set_default_sd(SimulationData &sd, stop->set_aim_vector(0, 0, 100); stop->set_surface(make_surface()); stop->set_aperture(make_aperture(sx, sy)); + sd.add_element(stop); // Set parameters SimulationParameters &params = sd.get_simulation_parameters(); @@ -70,7 +71,7 @@ TEST(OptixRunner, FlatRectangle) auto aper = make_aperture(XL, YL); SimulationData sd; - element_id test_elid = (sd, surf, aper); + element_id test_elid = set_default_sd(sd, surf, aper); SimulationResult result; OptixRunner runner; From 6808cd6579f72ff64f63eef6e96b4887bc536edf Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 10:47:34 -0600 Subject: [PATCH 41/60] Another dumb error... --- .../optix_runner/geometry_intersection_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/geometry_intersection_test.cpp
- stop = make_element(); + element_ptr stop = make_element(); double xlb, xub, ylb, yub; ap->bounding_box(xlb, xub, ylb, yub); const double sx = std::max(fabs(xlb), fabs(xub)) + 1.0; From e1cddd7906d9bfd2240966dba9e52b444d9d155b Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 10:49:19 -0600 Subject: [PATCH 42/60] Fix hexagon aperture mistake --- .../optix_runner/OptixCSP/src/shaders/intersection.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu index 0f30211d..b9688d39 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/intersection.cu @@ -669,7 +669,7 @@ extern "C" __global__ void __intersection__hexagon_flat() // Left side float y1 = sqrtf(3.0f) * (p.x + s); float y2 = -y1; - if (y1 <= p.y && p.y <= y2) + if (y2 <= p.y && p.y <= y1) { is_in = true; } @@ -679,7 +679,7 @@ extern "C" __global__ void __intersection__hexagon_flat() // Right side float y1 = sqrtf(3.0f) * (p.x - s); float y2 = -y1; - if (y2 <= p.y && p.y <= y1) + if (y1 <= p.y && p.y <= y2) { is_in = true; } From 8b2d78fbff5a98cb651f0471de12d83715350e26 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 14:34:36 -0600 Subject: [PATCH 43/60] Address copilot review comments --- coretrace/CMakeLists.txt | 2 +- coretrace/simdriver/main.cpp | 19 ++++++++++++------- .../embree_runner/ftz_daz.hpp | 7 ++++--- .../OptixCSP/src/core/soltrace_system.cpp | 2 ++ .../OptixCSP/src/shaders/Soltrace.h | 1 - .../optix_runner/optix_runner.cpp | 11 +++++++++++ .../optix_runner/optix_runner.hpp | 4 ++++ 7 files changed, 34 insertions(+), 12 deletions(-) diff --git a/coretrace/CMakeLists.txt b/coretrace/CMakeLists.txt index 63deb273..85e6c02c 100644 --- a/coretrace/CMakeLists.txt +++ 
b/coretrace/CMakeLists.txt @@ -161,7 +161,7 @@ if(SOLTRACE_BUILD_CORETRACE) PRIVATE ${embree_INCLUDE_DIRS} ) - target_link_libraries(coretrace PRIVATE simdata native_runner simresult embree_runner ${embree_LIBRARIES}) + target_link_libraries(coretrace PRIVATE simdata native_runner optix_runner simresult embree_runner ${embree_LIBRARIES}) ##################################################################################################################### # diff --git a/coretrace/simdriver/main.cpp b/coretrace/simdriver/main.cpp index b1cd360d..e99d4e32 100644 --- a/coretrace/simdriver/main.cpp +++ b/coretrace/simdriver/main.cpp @@ -92,25 +92,30 @@ int main(int argc, char *argv[]) } const bool file_optional = skip_output || skip_csv; - if (!file_optional && argc < 3) + + const std::string input_file = argv[1]; + + // argv[2], if present and not a flag (does not start with --), is treated as + // the output file path. This allows the user to supply an output path even + // when --no-output or --no-csv is also present without it being mis-parsed + // as an unknown option. + const bool has_output_arg = (argc >= 3) && (std::string(argv[2]).rfind("--", 0) != 0); + const std::string output_file = has_output_arg ? argv[2] : ""; + const int opts_start = has_output_arg ? 3 : 2; + + if (!file_optional && !has_output_arg) { std::cerr << "Error: output file is required unless --no-output or --no-csv is specified\n"; print_usage(argv[0]); return EXIT_FAILURE; } - const std::string input_file = argv[1]; - // output_file is only meaningful when neither skip_output nor skip_csv is set - const std::string output_file = (!file_optional && argc >= 3) ? argv[2] : ""; - int num_threads = 1; long long num_rays_override = -1; // -1 means use what the JSON specifies bool use_embree = false; bool use_optix = false; bool verbose = false; - // Start parsing options from argv[2] if output file is omitted, else from argv[3] - const int opts_start = file_optional ? 
2 : 3; for (int i = opts_start; i < argc; ++i) { const std::string arg = argv[i]; diff --git a/coretrace/simulation_runner/embree_runner/ftz_daz.hpp b/coretrace/simulation_runner/embree_runner/ftz_daz.hpp index 341ff7c0..b6b5767e 100644 --- a/coretrace/simulation_runner/embree_runner/ftz_daz.hpp +++ b/coretrace/simulation_runner/embree_runner/ftz_daz.hpp @@ -8,10 +8,11 @@ #if defined(__SSE__) || defined(_M_X64) || defined(_M_IX86) # include -# include +// Set FTZ (bit 15, 0x8000) and DAZ (bit 6, 0x0040) via MXCSR. +// is sufficient; (SSE3) is intentionally avoided +// so this compiles on SSE-only targets (-msse without -msse3). # define SOLTRACE_SET_FTZ_DAZ() \ - _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \ - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON) + _mm_setcsr(_mm_getcsr() | 0x8040u) #elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) # if defined(_MSC_VER) # include diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 43e2d951..887f1fe5 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -277,6 +277,7 @@ void SolTraceSystem::run() { // Initialize results m_hit_records.clear(); + m_hit_ray_ids.clear(); m_n_hit_rays = 0; m_n_sun_rays = 0; uint_fast64_t N_ray_hit = 0; @@ -345,6 +346,7 @@ void SolTraceSystem::run() m_n_sun_rays = N_ray_gen; } + // TODO: Add option to turn on/off trimming rays... // Trim excess rays: remove ray groups from the tail until m_n_hit_rays == m_number_of_rays. // Each group starts at the last HIT_CREATE record in m_hit_records. 
while (m_n_hit_rays > m_number_of_rays && !m_hit_records.empty()) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index fb9682f7..299a847f 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -26,7 +26,6 @@ namespace OptixCSP{ float4 hit_point; int32_t element_id; uint8_t hit_type; - uint8_t _pad[3]; // TODO: Is this necessary? }; enum RayType diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index dfbb277a..5a400fea 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -86,6 +86,17 @@ RunnerStatus OptixRunner::setup_parameters(const SimulationData *data) { // Get Parameter data const SimulationParameters &sim_params = data->get_simulation_parameters(); + + // ray_offset and per-ray ids are stored as uint32_t in the compaction path. + // Reject runs that would overflow before they start rather than silently + // wrapping and corrupting trimming or reported launched-ray counts. 
+ if (sim_params.max_number_of_rays > static_cast(std::numeric_limits::max())) + { + throw std::overflow_error( + "max_number_of_rays exceeds UINT32_MAX; the OptiX runner stores " + "ray offsets and ids as uint32_t and cannot represent this run."); + } + m_sys.set_number_of_rays(sim_params.number_of_rays, sim_params.max_number_of_rays); m_sys.set_seed(static_cast(sim_params.seed)); diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 8920244a..40ddc584 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -42,6 +42,10 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner void set_verbose(bool verbose); + // Set the number of rays to launch for a trace in each optixLaunch call. + // WARNING: The runner is forced to use this batch size regardless of available GPU memory!!!! + // Setting a large batch size can cause device out of memory errors or degraded GPU performance. + // Setting a small batch size can cause long run times. Care is required when using this function. 
void set_batch_size(uint_fast64_t batch_size); uint_fast64_t get_batch_size() const; From 4e7247ac2535e9eafc0304deefca52258ada5af6 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 14:41:44 -0600 Subject: [PATCH 44/60] Add option to enable/disable OptixRunner ray trimming; added test for it as well --- .../OptixCSP/src/core/soltrace_system.cpp | 3 +- .../OptixCSP/src/core/soltrace_system.h | 6 +++ .../optix_runner/optix_runner.cpp | 10 +++++ .../optix_runner/optix_runner.hpp | 5 +++ .../optix_runner/two_plate_test.cpp | 39 ++++++++++++++++++- 5 files changed, 60 insertions(+), 3 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 887f1fe5..3dc26647 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -346,10 +346,9 @@ void SolTraceSystem::run() m_n_sun_rays = N_ray_gen; } - // TODO: Add option to turn on/off trimming rays... // Trim excess rays: remove ray groups from the tail until m_n_hit_rays == m_number_of_rays. // Each group starts at the last HIT_CREATE record in m_hit_records. 
- while (m_n_hit_rays > m_number_of_rays && !m_hit_records.empty()) + while (m_trim_excess_rays && m_n_hit_rays > m_number_of_rays && !m_hit_records.empty()) { // Walk backwards to find the last CREATE record auto rit = std::find_if(m_hit_records.rbegin(), m_hit_records.rend(), diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 63da2aa5..d588dc5a 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -126,6 +126,11 @@ namespace OptixCSP uint_fast64_t get_N_hit_rays() const { return m_n_hit_rays; } void set_sun_shape_errors(bool flag) { this->m_include_sun_shape_errors = flag; } + /// Enable or disable trimming excess rays at the end of run() so that + /// exactly m_number_of_rays hit rays are returned. Enabled by default. + void set_trim_excess_rays(bool trim) { m_trim_excess_rays = trim; } + bool get_trim_excess_rays() const { return m_trim_excess_rays; } + private: std::shared_ptr geometry_manager; std::shared_ptr pipeline_manager; @@ -143,6 +148,7 @@ namespace OptixCSP SolTrace::Data::Sun *m_sun; bool m_include_sun_shape_errors = false; + bool m_trim_excess_rays = true; uint64_t m_seed = 123456ULL; bool m_optical_errors; diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index 5a400fea..bb53636b 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -40,6 +40,16 @@ uint_fast64_t OptixRunner::get_batch_size() const return m_sys.get_batch_size(); } +void OptixRunner::set_trim_excess_rays(bool trim) +{ + m_sys.set_trim_excess_rays(trim); +} + +bool OptixRunner::get_trim_excess_rays() const +{ + return m_sys.get_trim_excess_rays(); +} + uint64_t 
OptixRunner::get_N_run_iterations() const { return m_sys.get_N_run_iterations(); diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 40ddc584..025cec37 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -49,6 +49,11 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner void set_batch_size(uint_fast64_t batch_size); uint_fast64_t get_batch_size() const; + /// Enable or disable trimming of excess rays at the end of run() so that + /// exactly the requested number of hit rays is returned. Enabled by default. + void set_trim_excess_rays(bool trim); + bool get_trim_excess_rays() const; + // Runner options // void disable_sun_shape_errors() { this->include_sun_shape_errors = false; } // void enable_sun_shape_errors() { this->include_sun_shape_errors = true; } diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp index b46c447c..b88f5916 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/two_plate_test.cpp @@ -245,4 +245,41 @@ TEST(TwoPlateOptix, SimResults) } } -} \ No newline at end of file +} + +TEST(TwoPlateOptix, TrimExcessRaysOption) +{ + SimulationData sd; + element_ptr plate1, plate2; + make_two_plate_sd(sd, plate1, plate2); + const int n_rays = sd.get_simulation_parameters().number_of_rays; + + // Default: trim enabled — result has exactly n_rays records + { + OptixRunner runner; + EXPECT_TRUE(runner.get_trim_excess_rays()); // default is true + + ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS); + + SimulationResult result; + 
ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS); + EXPECT_EQ(result.get_number_of_records(), n_rays); + } + + // Trim disabled — result has at least n_rays records (batch overshoot is not removed) + { + OptixRunner runner; + runner.set_trim_excess_rays(false); + EXPECT_FALSE(runner.get_trim_excess_rays()); + + ASSERT_EQ(runner.initialize(), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); + ASSERT_EQ(runner.run_simulation(), RunnerStatus::SUCCESS); + + SimulationResult result; + ASSERT_EQ(runner.report_simulation(&result, 0), RunnerStatus::SUCCESS); + EXPECT_GE(result.get_number_of_records(), n_rays); + } +} From 08f6f8c7a26bbb77e5354588f578366192642dcb Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 14:54:00 -0600 Subject: [PATCH 45/60] Fix some memory leaks in optix runner --- .../OptixCSP/src/core/data_manager.cpp | 20 +++++++++++++++++++ .../OptixCSP/src/core/soltrace_system.cpp | 11 ++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp index bbe1fd7f..6e0e46f0 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp @@ -71,6 +71,11 @@ OptixCSP::LaunchParams *dataManager::getDeviceLaunchParams() const { return laun void dataManager::allocateLaunchParams() { + if (launch_params_D) + { + CUDA_CHECK(cudaFree(launch_params_D)); + launch_params_D = nullptr; + } CUDA_CHECK(cudaMalloc(reinterpret_cast(&launch_params_D), sizeof(LaunchParams))); } @@ -109,6 +114,11 @@ void dataManager::ensureCurandStates( void dataManager::allocateGeometryDataArray(std::vector geometry_data_array_H) { + if (geometry_data_array_D) + { + CUDA_CHECK(cudaFree(geometry_data_array_D)); + geometry_data_array_D = nullptr; + } 
CUDA_CHECK(cudaMalloc(reinterpret_cast(&geometry_data_array_D), geometry_data_array_H.size() * sizeof(GeometryDataST))); @@ -136,6 +146,16 @@ void dataManager::updateGeometryDataArray(std::vector geometry_d void dataManager::allocateMaterialDataArray(std::vector material_data_array_front_H, std::vector material_data_array_back_H) { + if (material_data_array_front_D) + { + CUDA_CHECK(cudaFree(material_data_array_front_D)); + material_data_array_front_D = nullptr; + } + if (material_data_array_back_D) + { + CUDA_CHECK(cudaFree(material_data_array_back_D)); + material_data_array_back_D = nullptr; + } CUDA_CHECK(cudaMalloc(reinterpret_cast(&material_data_array_front_D), material_data_array_front_H.size() * sizeof(MaterialData))); diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 3dc26647..692748f4 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -86,6 +86,7 @@ SolTraceSystem::SolTraceSystem() SolTraceSystem::~SolTraceSystem() { + clean_up(); } void SolTraceSystem::set_verbose(bool verbose) @@ -250,8 +251,9 @@ void SolTraceSystem::initialize() data_manager->launch_params_H.sun_dir_seed = m_seed; data_manager->launch_params_H.optical_errors = m_optical_errors; - // Create a CUDA stream for asynchronous operations. - CUDA_CHECK(cudaStreamCreate(&m_state.stream)); + // Create a CUDA stream for asynchronous operations (once; guard against re-init leak). + if (!m_state.stream) + CUDA_CHECK(cudaStreamCreate(&m_state.stream)); // Link the GAS handle. data_manager->launch_params_H.handle = m_state.gas_handle; @@ -511,6 +513,11 @@ void SolTraceSystem::reset() // with their corresponding programs (ray generation, miss, and hit group). 
void SolTraceSystem::create_shader_binding_table() { + // Free any previously allocated SBT records to avoid leaks on re-initialization. + CUDA_CHECK(cudaFree(reinterpret_cast(m_state.sbt.raygenRecord))); + CUDA_CHECK(cudaFree(reinterpret_cast(m_state.sbt.missRecordBase))); + CUDA_CHECK(cudaFree(reinterpret_cast(m_state.sbt.hitgroupRecordBase))); + m_state.sbt = {}; // Ray generation program record { From ae78f0f2e4f375af2f1713844649167f478f7e59 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 15:25:07 -0600 Subject: [PATCH 46/60] Make automatic batch sizing more stable on multiple run calls --- .../OptixCSP/src/core/soltrace_system.cpp | 51 ++++++++++++++++--- .../OptixCSP/src/core/soltrace_system.h | 7 ++- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 692748f4..97dd9e6f 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -53,6 +53,7 @@ SolTraceSystem::SolTraceSystem() m_timer_optix_launch(), m_timer_collect_results(), m_n_run_iterations(0), + m_mem_free_post_setup(0), geometry_manager(std::make_shared(m_state, m_verbose)), data_manager(std::make_shared()), pipeline_manager(std::make_shared(m_state)), @@ -146,8 +147,10 @@ void SolTraceSystem::initialize() OPTIX_CHECK(optixDeviceContextCreate(cuCtx, &options, &m_state.context)); } - size_t mem_total; - cudaMemGetInfo(&m_mem_free_before, &mem_total); + { + size_t mem_total; + CUDA_CHECK(cudaMemGetInfo(&m_mem_free_before, &mem_total)); + } m_timer_setup.start(); // set up input related to sun @@ -272,6 +275,17 @@ void SolTraceSystem::initialize() } data_manager->allocateLaunchParams(); + + // Snapshot free GPU memory now that all setup allocations (BVH, pipeline, + // SBT, geometry/material 
arrays, launch params) are complete but before any + // ray buffers exist. automatic_batch_size() uses this as a stable baseline + // so that batch sizing is consistent across every run() call. + // Memory used by setup = m_mem_free_before - m_mem_free_post_setup. + { + size_t mem_total; + CUDA_CHECK(cudaMemGetInfo(&m_mem_free_post_setup, &mem_total)); + } + m_timer_setup.stop(); } @@ -313,7 +327,6 @@ void SolTraceSystem::run() int width = data_manager->launch_params_H.width; int height = data_manager->launch_params_H.height; - size_t m_mem_free_after; size_t mem_total; cudaMemGetInfo(&m_mem_free_after, &mem_total); @@ -491,6 +504,10 @@ void SolTraceSystem::clean_up() m_state.gas_handle = 0; m_state.sbt = {}; m_state.d_gas_output_buffer = 0; + + m_mem_free_before = 0; + m_mem_free_post_setup = 0; + m_mem_free_after = 0; } void SolTraceSystem::reset() @@ -760,6 +777,25 @@ void SolTraceSystem::print_timing() const std::cout << "\n--- Grand Total ---\n"; std::cout << " Setup + Trace : " << (t_setup + t_trace) << " s\n"; + + std::cout << "\n--- GPU Memory Usage ---\n"; + constexpr double kMB = 1.0 / (1024.0 * 1024.0); + if (m_mem_free_before > 0) + { + std::cout << std::fixed << std::setprecision(2); + std::cout << " Free before setup : " << m_mem_free_before * kMB << " MB\n"; + if (m_mem_free_post_setup > 0) + { + std::cout << " Free after setup : " << m_mem_free_post_setup * kMB << " MB\n"; + std::cout << " Setup structures : " << (m_mem_free_before - m_mem_free_post_setup) * kMB << " MB\n"; + if (m_mem_free_after > 0) + { + std::cout << " Ray buffers : " << (m_mem_free_post_setup - m_mem_free_after) * kMB << " MB\n"; + std::cout << " Total used : " << (m_mem_free_before - m_mem_free_after) * kMB << " MB\n"; + } + } + std::cout << std::fixed << std::setprecision(6); + } std::cout << "=====================================\n"; } @@ -778,10 +814,11 @@ double SolTraceSystem::get_sun_plane_area() const uint_fast64_t SolTraceSystem::automatic_batch_size() const { - // 
Query free GPU memory *after* the BVH and pipeline have been built so - // that only the ray-data buffers need to fit in the remaining space. - size_t mem_free, mem_total; - CUDA_CHECK(cudaMemGetInfo(&mem_free, &mem_total)); + // Use the free-memory snapshot taken at the end of initialize(), after all + // setup allocations (BVH, pipeline, SBT, etc.) but before any ray buffers. + // This gives a stable baseline that does not shrink on subsequent run() calls + // due to the already-allocated (and reused) ray buffers being counted as used. + const size_t mem_free = m_mem_free_post_setup; // Reserve 20 % headroom for OptiX internal allocations, memory // fragmentation, and any other transient allocations during launch. diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index d588dc5a..90ee08bd 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -212,7 +212,10 @@ namespace OptixCSP uint64_t m_n_run_iterations; // memory usage - size_t m_mem_free_before; - size_t m_mem_free_after; + size_t m_mem_free_before; ///< Free GPU memory at the start of initialize(), before any setup allocations. + size_t m_mem_free_post_setup; ///< Free GPU memory at the end of initialize(), after all setup allocations (BVH, + /// pipeline, SBT, geometry/material arrays). Used as the baseline in + /// automatic_batch_size() so batch sizing is stable across run() calls. + size_t m_mem_free_after; ///< Free GPU memory sampled during run() for per-launch memory reporting. 
}; } From 1529f5bba9f496ba2fc769b118bd5e7b701c788c Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 19 May 2026 15:34:03 -0600 Subject: [PATCH 47/60] Remove outdated todos --- .../optix_runner/OptixCSP/src/shaders/sun.cu | 6 ------ coretrace/simulation_runner/optix_runner/optix_runner.cpp | 5 ----- 2 files changed, 11 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu index 853b53bf..6805c336 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu @@ -2,8 +2,6 @@ #include #include -// todo: move curand initializatin to global function - //#include //#include #include "Soltrace.h" @@ -366,10 +364,6 @@ extern "C" __global__ void __raygen__sun_source() prd.ray_path_index = ray_number; prd.depth = 0; - // TODO make this a launch parameter - // params.hit_point_buffer[params.max_depth * prd.ray_path_index] = make_float4(0.0f, ray_gen_pos); - // params.element_id_buffer[params.max_depth * prd.ray_path_index] = OptixCSP::kElementIdRayGen; - // params.hit_type_buffer[params.max_depth * prd.ray_path_index] = OptixCSP::HitType::HIT_CREATE; params.hit_buffer[params.max_depth * prd.ray_path_index].hit_point = make_float4(0.0f, ray_gen_pos); params.hit_buffer[params.max_depth * prd.ray_path_index].element_id = OptixCSP::kElementIdRayGen; params.hit_buffer[params.max_depth * prd.ray_path_index].hit_type = OptixCSP::HitType::HIT_CREATE; diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index bb53636b..e9fd118b 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -182,8 +182,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) } optix_el->set_id(static_cast(id)); - // TODO: 
check zrot, radiance or degree here? - // Add optical properties OpticalProperties *opt_front = el->get_front_optical_properties(); OptixCSP::OpticalDistribution od = this->to_optical_distribution(opt_front->error_distribution_type); @@ -260,7 +258,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) } auto surface = std::make_shared(); - // surface->set_half_height(2.); // TODO this needs to come from the aperture surface->set_half_height(0.5 * el_aperture->y_length()); surface->set_radius(el_surface->radius); optix_el->set_surface(surface); @@ -391,8 +388,6 @@ RunnerStatus OptixRunner::setup_elements(const SimulationData *data) RunnerStatus OptixRunner::update_simulation(const SimulationData *data) { - // TODO: Need this call? - // this->m_sys.clean_up(); return this->setup_simulation(data); // TODO: Implement this in a less lazy manner... } From aa686a3f9f6cc23a6b50071986c1abf632f39602 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 20 May 2026 10:09:10 -0600 Subject: [PATCH 48/60] Move to 64-bit unsigned integers for global ray ids; more copilot review fixes --- .../embree_runner/ftz_daz.hpp | 2 + .../OptixCSP/src/core/ray_utils.cu | 38 ++++++++++--------- .../OptixCSP/src/core/ray_utils.h | 6 +-- .../OptixCSP/src/core/soltrace_system.cpp | 12 +++--- .../OptixCSP/src/core/soltrace_system.h | 2 +- .../OptixCSP/src/shaders/Soltrace.h | 2 +- .../optix_runner/OptixCSP/src/shaders/sun.cu | 2 +- .../optix_runner/optix_runner.cpp | 10 ----- 8 files changed, 34 insertions(+), 40 deletions(-) diff --git a/coretrace/simulation_runner/embree_runner/ftz_daz.hpp b/coretrace/simulation_runner/embree_runner/ftz_daz.hpp index b6b5767e..742d4b17 100644 --- a/coretrace/simulation_runner/embree_runner/ftz_daz.hpp +++ b/coretrace/simulation_runner/embree_runner/ftz_daz.hpp @@ -1,6 +1,8 @@ #ifndef SOLTRACE_FTZ_DAZ_HPP #define SOLTRACE_FTZ_DAZ_HPP +#include + // Set Flush-to-Zero (FTZ) and Denormals-are-Zero (DAZ) floating-point flags // for the calling 
thread. These are thread-local CPU register settings that // avoid slow denormal handling in the FPU, as recommended by the Embree docs. diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu index 284a1c2a..661bb07a 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu @@ -45,9 +45,9 @@ namespace OptixCSP break; } - const uint32_t has_hit = (raw_count > 1) ? 1u : 0u; + const uint8_t has_hit = (raw_count > 1) ? 1u : 0u; out_record_count[ray] = has_hit ? raw_count : 0u; - out_has_hit[ray] = static_cast(has_hit); + out_has_hit[ray] = has_hit; return; } @@ -64,7 +64,7 @@ namespace OptixCSP const HitRecord *__restrict__ hit_buffer, uint32_t num_rays, uint32_t max_depth, - const uint32_t *__restrict__ offsets, + const uint64_t *__restrict__ offsets, const uint8_t *__restrict__ has_hit, HitRecord *__restrict__ out_buffer) { @@ -73,7 +73,7 @@ namespace OptixCSP return; const HitRecord *ray_base = hit_buffer + max_depth * ray; - uint32_t out_idx = offsets[ray]; + uint64_t out_idx = offsets[ray]; // Depth 0 is always HIT_CREATE for qualifying rays out_buffer[out_idx++] = ray_base[0]; @@ -102,7 +102,7 @@ namespace OptixCSP free_compaction_scratch(scratch); CUDA_CHECK(cudaMalloc(&scratch.d_count, num_rays * sizeof(uint32_t))); - CUDA_CHECK(cudaMalloc(&scratch.d_offsets, num_rays * sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(&scratch.d_offsets, num_rays * sizeof(uint64_t))); CUDA_CHECK(cudaMalloc(&scratch.d_has_hit, num_rays * sizeof(uint8_t))); CUDA_CHECK(cudaMalloc(&scratch.d_n_hit, sizeof(uint32_t))); @@ -110,12 +110,13 @@ namespace OptixCSP // scan_bytes must cover both ExclusiveSum and DeviceSelect::Flagged (d_scan_tmp is reused). 
uint32_t *null_u32 = nullptr; uint8_t *null_u8 = nullptr; - cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, null_u32, null_u32, num_rays); + uint64_t *null_u64 = nullptr; + cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, null_u32, null_u64, num_rays); cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, null_u8, null_u32, num_rays); size_t select_bytes = 0; - thrust::counting_iterator count_iter(0u); - cub::DeviceSelect::Flagged(nullptr, select_bytes, count_iter, null_u8, null_u32, null_u32, num_rays); + thrust::counting_iterator count_iter(0ull); + cub::DeviceSelect::Flagged(nullptr, select_bytes, count_iter, null_u8, null_u64, null_u32, num_rays); if (select_bytes > scratch.scan_bytes) scratch.scan_bytes = select_bytes; @@ -157,9 +158,9 @@ namespace OptixCSP const HitRecord *d_hit_buffer, uint32_t num_rays, uint32_t max_depth, - uint32_t ray_offset, + uint64_t ray_offset, std::vector &host_out, - std::vector &host_ray_ids, + std::vector &host_ray_ids, cudaStream_t stream, CompactionScratch &scratch, CompactionTimings *timings) @@ -198,8 +199,9 @@ namespace OptixCSP std::chrono::high_resolution_clock::time_point t_scalar; if (timings) t_scalar = std::chrono::high_resolution_clock::now(); - uint32_t last_offset = 0, last_count = 0, n_hit_rays = 0; - CUDA_CHECK(cudaMemcpy(&last_offset, scratch.d_offsets + (num_rays - 1), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + uint64_t last_offset = 0; + uint32_t last_count = 0, n_hit_rays = 0; + CUDA_CHECK(cudaMemcpy(&last_offset, scratch.d_offsets + (num_rays - 1), sizeof(uint64_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(&last_count, scratch.d_count + (num_rays - 1), sizeof(uint32_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(&n_hit_rays, scratch.d_n_hit, sizeof(uint32_t), cudaMemcpyDeviceToHost)); @@ -207,7 +209,7 @@ namespace OptixCSP timings->scalar_dth_ms += std::chrono::duration( std::chrono::high_resolution_clock::now() - t_scalar).count(); - const 
uint32_t total_records = last_offset + last_count; + const uint64_t total_records = last_offset + last_count; if (total_records > 0) { @@ -217,14 +219,14 @@ namespace OptixCSP compact_ray_outputs<<>>( d_hit_buffer, num_rays, max_depth, scratch.d_offsets, scratch.d_has_hit, scratch.d_compacted); - // ---- After Pass 2 d_offsets is free; reuse it to compact global ray IDs ---- + // ---- Compact global ray IDs into d_offsets (reused after pass 2) ---- // DeviceSelect::Flagged selects (ray_offset + i) for each i where d_has_hit[i] == 1. - // d_scan_tmp is also free (ExclusiveSum already completed). - thrust::counting_iterator ray_id_iter(ray_offset); + // d_scan_tmp and d_offsets are both free after compact_ray_outputs completes. + thrust::counting_iterator ray_id_iter(ray_offset); cub::DeviceSelect::Flagged( scratch.d_scan_tmp, scratch.scan_bytes, ray_id_iter, scratch.d_has_hit, - scratch.d_offsets, // output: global IDs of hit rays + scratch.d_offsets, // output: global IDs of hit rays (uint64_t) scratch.d_n_hit, // output count (already read; safe to overwrite) num_rays, stream); @@ -258,7 +260,7 @@ namespace OptixCSP CUDA_CHECK(cudaMemcpy( host_ray_ids.data() + prev_ids, scratch.d_offsets, - n_hit_rays * sizeof(uint32_t), + n_hit_rays * sizeof(uint64_t), cudaMemcpyDeviceToHost)); if (timings) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h index ed1a33c4..06ce8369 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h @@ -15,7 +15,7 @@ namespace OptixCSP struct CompactionScratch { uint32_t *d_count = nullptr; // per-ray output record count - uint32_t *d_offsets = nullptr; // exclusive prefix-sum of d_count + uint64_t *d_offsets = nullptr; // exclusive prefix-sum of d_count (pass 1); reused as global ray IDs (pass 2) uint8_t *d_has_hit = nullptr; // 1 if 
ray contributes records, else 0 uint32_t *d_n_hit = nullptr; // scalar: total hit rays void *d_scan_tmp = nullptr; // CUB DeviceScan temp storage @@ -59,9 +59,9 @@ namespace OptixCSP const HitRecord *d_hit_buffer, uint32_t num_rays, uint32_t max_depth, - uint32_t ray_offset, + uint64_t ray_offset, std::vector &host_out, - std::vector &host_ray_ids, + std::vector &host_ray_ids, cudaStream_t stream, CompactionScratch &scratch, CompactionTimings *timings = nullptr); diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 97dd9e6f..c4d45b0f 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -377,7 +377,7 @@ void SolTraceSystem::run() } // m_n_sun_rays = rays generated up to and including the last retained hit ray. if (!m_hit_ray_ids.empty()) - m_n_sun_rays = static_cast(m_hit_ray_ids.back()) + 1; + m_n_sun_rays = m_hit_ray_ids.back() + 1; m_timer_trace.stop(); @@ -409,7 +409,7 @@ void SolTraceSystem::run() void SolTraceSystem::update() { - const int N_slots = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth; + const size_t N_slots = static_cast(data_manager->launch_params_H.width) * static_cast(data_manager->launch_params_H.height) * static_cast(data_manager->launch_params_H.max_depth); const size_t hit_buffer_size = N_slots * sizeof(HitRecord); // update aabb and sun plane accordingly @@ -632,8 +632,8 @@ void SolTraceSystem::allocate_device_buffers() data_manager->launch_params_H.height = 1; data_manager->launch_params_H.max_depth = MAX_TRACE_DEPTH; - const size_t hit_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth * sizeof(HitRecord); - const size_t sun_dir_size = 
data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float3); + const size_t hit_buffer_size = static_cast(data_manager->launch_params_H.width) * static_cast(data_manager->launch_params_H.height) * static_cast(data_manager->launch_params_H.max_depth) * sizeof(HitRecord); + const size_t sun_dir_size = static_cast(data_manager->launch_params_H.width) * static_cast(data_manager->launch_params_H.height) * sizeof(float3); // NOTE: cudaFree is nullptr safe @@ -669,8 +669,8 @@ void SolTraceSystem::allocate_device_buffers() void SolTraceSystem::setup_device_buffer() { - const size_t hit_buffer_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * data_manager->launch_params_H.max_depth * sizeof(HitRecord); - const size_t sun_dir_size = data_manager->launch_params_H.width * data_manager->launch_params_H.height * sizeof(float3); + const size_t hit_buffer_size = static_cast(data_manager->launch_params_H.width) * static_cast(data_manager->launch_params_H.height) * static_cast(data_manager->launch_params_H.max_depth) * sizeof(HitRecord); + const size_t sun_dir_size = static_cast(data_manager->launch_params_H.width) * static_cast(data_manager->launch_params_H.height) * sizeof(float3); CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size)); CUDA_CHECK(cudaMemset(data_manager->launch_params_H.sun_dir_buffer, 0, sun_dir_size)); diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 90ee08bd..7c3be6d4 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -163,7 +163,7 @@ namespace OptixCSP // Global ray index (ray_offset + local_index) for each logical hit ray in m_hit_records. 
// Parallel to the logical rays (not records): m_hit_ray_ids.size() == m_n_hit_rays. - std::vector m_hit_ray_ids; + std::vector m_hit_ray_ids; // Count of rays that produced at least one non-CREATE hit. uint_fast64_t m_n_hit_rays = 0; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index 299a847f..5f8e2e63 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -53,7 +53,7 @@ namespace OptixCSP{ unsigned int width; // essentially number of rays launched and sun points unsigned int height; int max_depth; - unsigned int ray_offset; // Global offset for current branch + unsigned long long ray_offset; // Global offset for current branch // float4* hit_point_buffer; HitRecord* hit_buffer; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu index 6805c336..3d75cf8d 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu @@ -303,7 +303,7 @@ extern "C" __global__ void __raygen__sun_source() const uint3 launch_idx = optixGetLaunchIndex(); // Index of the current launch thread const uint3 launch_dims = optixGetLaunchDimensions(); // Dimensions of the launch grid const unsigned int ray_number = launch_idx.y * launch_dims.x + launch_idx.x; // Unique ray ID - const unsigned int ray_number_global = ray_number + params.ray_offset; // Global unique ray ID + const unsigned long long ray_number_global = ray_number + params.ray_offset; // Global unique ray ID float3 sun_sample_pos; switch (params.sun_gen_type) diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index e9fd118b..fd9f8535 
100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -97,16 +97,6 @@ RunnerStatus OptixRunner::setup_parameters(const SimulationData *data) // Get Parameter data const SimulationParameters &sim_params = data->get_simulation_parameters(); - // ray_offset and per-ray ids are stored as uint32_t in the compaction path. - // Reject runs that would overflow before they start rather than silently - // wrapping and corrupting trimming or reported launched-ray counts. - if (sim_params.max_number_of_rays > static_cast(std::numeric_limits::max())) - { - throw std::overflow_error( - "max_number_of_rays exceeds UINT32_MAX; the OptiX runner stores " - "ray offsets and ids as uint32_t and cannot represent this run."); - } - m_sys.set_number_of_rays(sim_params.number_of_rays, sim_params.max_number_of_rays); m_sys.set_seed(static_cast(sim_params.seed)); From e609f99014051933870727f1ed97db271b75e87e Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 20 May 2026 10:47:02 -0600 Subject: [PATCH 49/60] Limit max trace depth to 255 using 8-bit unsigned integer --- .../optix_runner/OptixCSP/src/core/ray_utils.cu | 13 +++++++------ .../optix_runner/OptixCSP/src/core/ray_utils.h | 2 +- .../OptixCSP/src/core/soltrace_system.cpp | 6 +++--- .../optix_runner/OptixCSP/src/shaders/Soltrace.h | 7 ++++--- .../optix_runner/OptixCSP/src/shaders/materials.cu | 4 ++-- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu index 661bb07a..82baac26 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu @@ -27,7 +27,7 @@ namespace OptixCSP const HitRecord *__restrict__ hit_buffer, uint32_t num_rays, uint32_t max_depth, - uint32_t *__restrict__ 
out_record_count, + uint8_t *__restrict__ out_record_count, uint8_t *__restrict__ out_has_hit) { const uint32_t ray = blockIdx.x * blockDim.x + threadIdx.x; @@ -46,7 +46,7 @@ namespace OptixCSP } const uint8_t has_hit = (raw_count > 1) ? 1u : 0u; - out_record_count[ray] = has_hit ? raw_count : 0u; + out_record_count[ray] = has_hit ? static_cast(raw_count) : 0u; out_has_hit[ray] = has_hit; return; @@ -101,7 +101,7 @@ namespace OptixCSP { free_compaction_scratch(scratch); - CUDA_CHECK(cudaMalloc(&scratch.d_count, num_rays * sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(&scratch.d_count, num_rays * sizeof(uint8_t))); CUDA_CHECK(cudaMalloc(&scratch.d_offsets, num_rays * sizeof(uint64_t))); CUDA_CHECK(cudaMalloc(&scratch.d_has_hit, num_rays * sizeof(uint8_t))); CUDA_CHECK(cudaMalloc(&scratch.d_n_hit, sizeof(uint32_t))); @@ -111,7 +111,7 @@ namespace OptixCSP uint32_t *null_u32 = nullptr; uint8_t *null_u8 = nullptr; uint64_t *null_u64 = nullptr; - cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, null_u32, null_u64, num_rays); + cub::DeviceScan::ExclusiveSum(scratch.d_scan_tmp, scratch.scan_bytes, null_u8, null_u64, num_rays); cub::DeviceReduce::Sum(scratch.d_red_tmp, scratch.red_bytes, null_u8, null_u32, num_rays); size_t select_bytes = 0; @@ -200,9 +200,10 @@ namespace OptixCSP if (timings) t_scalar = std::chrono::high_resolution_clock::now(); uint64_t last_offset = 0; - uint32_t last_count = 0, n_hit_rays = 0; + uint8_t last_count = 0; + uint32_t n_hit_rays = 0; CUDA_CHECK(cudaMemcpy(&last_offset, scratch.d_offsets + (num_rays - 1), sizeof(uint64_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(&last_count, scratch.d_count + (num_rays - 1), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&last_count, scratch.d_count + (num_rays - 1), sizeof(uint8_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(&n_hit_rays, scratch.d_n_hit, sizeof(uint32_t), cudaMemcpyDeviceToHost)); if (timings) diff --git 
a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h index 06ce8369..c0783815 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h @@ -14,7 +14,7 @@ namespace OptixCSP /// as long as num_rays and max_depth stay the same. struct CompactionScratch { - uint32_t *d_count = nullptr; // per-ray output record count + uint8_t *d_count = nullptr; // per-ray output record count (bounded by max_depth ≤ 255) uint64_t *d_offsets = nullptr; // exclusive prefix-sum of d_count (pass 1); reused as global ray IDs (pass 2) uint8_t *d_has_hit = nullptr; // 1 if ray contributes records, else 0 uint32_t *d_n_hit = nullptr; // scalar: total hit rays diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index c4d45b0f..1ba693f5 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -832,11 +832,11 @@ uint_fast64_t SolTraceSystem::automatic_batch_size() const // d_compacted MAX_TRACE_DEPTH * sizeof(HitRecord) -- worst-case compacted copy // sun_dir_buffer sizeof(float3) -- sun ray direction // curand states sizeof(curandState) -- RNG state - // d_count sizeof(uint32_t) -- compaction hit count - // d_offsets sizeof(uint32_t) -- compaction prefix sum + // d_offsets sizeof(uint64_t) -- compaction prefix sum / global ray IDs + // d_count sizeof(uint8_t) -- compaction hit count (bounded by MAX_TRACE_DEPTH <= 255) // d_has_hit sizeof(uint8_t) -- per-ray hit flag const size_t bytes_per_ray = - 2u * MAX_TRACE_DEPTH * sizeof(HitRecord) + sizeof(float3) + sizeof(curandState) + 2u * sizeof(uint32_t) + sizeof(uint8_t); + 2u * MAX_TRACE_DEPTH * sizeof(HitRecord) + 
sizeof(float3) + sizeof(curandState) + sizeof(uint64_t) + 2u * sizeof(uint8_t); const uint_fast64_t computed = (bytes_per_ray > 0) ? static_cast(usable_bytes / bytes_per_ray) : 0u; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index 5f8e2e63..d2e531b9 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -4,6 +4,7 @@ #include "MaterialDataST.h" #include "soltrace_constants.h" +#include #include #include #include @@ -14,8 +15,8 @@ namespace OptixCSP{ const unsigned int NUM_PAYLOAD_VALUES = 2u; // NOTE: Maximum number of ray interactions in tracing with the geometry is // MAX_TRACE_DEPTH - 1 (so currently 4). See the end of the function - // __closesthit__element in materials.cu. - const unsigned int MAX_TRACE_DEPTH = 5u; + // __closesthit__element in materials.cu. Note the type. Limited to 255. 
+ const uint8_t MAX_TRACE_DEPTH = 5u; struct HitGroupData { @@ -52,7 +53,7 @@ namespace OptixCSP{ unsigned int width; // essentially number of rays launched and sun points unsigned int height; - int max_depth; + unsigned int max_depth; unsigned long long ray_offset; // Global offset for current branch // float4* hit_point_buffer; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu index 3eb55621..9e903f76 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu @@ -125,7 +125,7 @@ extern "C" __global__ void __closesthit__element() const float3 hit_point = ray_orig + ray_t * ray_dir; OptixCSP::PerRayData prd = OptixCSP::getPayload(); - const int new_depth = prd.depth + 1; // Increment the ray depth for recursive tracing + const unsigned int new_depth = prd.depth + 1; // Increment the ray depth for recursive tracing // we have two scenarios here // if we use refraction, then we look at transmissivity to determine if the ray will refract @@ -241,7 +241,7 @@ extern "C" __global__ void __closesthit__element() { // Get buffer slot - const int slot = params.max_depth * prd.ray_path_index + new_depth; + const unsigned int slot = params.max_depth * prd.ray_path_index + new_depth; // Store the hit point in the hit point buffer (used for visualization or further calculations) params.hit_buffer[slot].hit_point = make_float4(new_depth, hit_point); From ebb1e50cd5258c1caba7b7f16bae2b590d0360a2 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 20 May 2026 13:55:46 -0600 Subject: [PATCH 50/60] Add ray generation tests for optix runner --- .../optix_runner/CMakeLists.txt | 1 + .../ray_position_sampling_test.cpp | 348 ++++++++++++++++++ 2 files changed, 349 insertions(+) create mode 100644 
google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt b/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt index c77660a2..bbfcf9ae 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt +++ b/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt @@ -20,6 +20,7 @@ set(OPTIX_RUNNER_TEST_SRC flat_optical_test.cpp two_plate_test.cpp sun_test.cpp + ray_position_sampling_test.cpp batch_size_test.cpp ) diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp new file mode 100644 index 00000000..b987bf3d --- /dev/null +++ b/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp @@ -0,0 +1,348 @@ +// ray_position_sampling_test.cpp +// +// Tests for sun parallelogram position sampling (GenType::HALTON and +// GenType::RANDOM). Each test fires rays straight down onto a large flat plate +// with no sun-shape or optical errors, then inspects position[0] of every ray +// record — the ray-generation point on the sun parallelogram — to verify the +// spatial distribution. + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +using SolTrace::Runner::RunnerStatus; + +namespace +{ + // ---- Minimal KS-test infrastructure ------------------------------------ + // (duplicated from sun_test.cpp — kept local to this TU) + + static double clamp01(double x) + { + return x <= 0.0 ? 0.0 : (x >= 1.0 ? 
1.0 : x); + } + + static double ks_statistic(const std::vector& samples, + const std::function& cdf) + { + if (samples.empty()) return 1.0; + std::vector sorted = samples; + std::sort(sorted.begin(), sorted.end()); + const double n = static_cast(sorted.size()); + double d = 0.0; + for (size_t i = 0; i < sorted.size(); ++i) + { + const double f = clamp01(cdf(sorted[i])); + const double emp_lo = static_cast(i) / n; + const double emp_hi = static_cast(i + 1) / n; + d = std::max(d, std::abs(f - emp_lo)); + d = std::max(d, std::abs(emp_hi - f)); + } + return d; + } + + static double ks_pvalue_asymptotic(double d, size_t n) + { + if (n == 0) return 0.0; + if (d <= 0.0) return 1.0; + const double sqrtn = std::sqrt(static_cast(n)); + const double x = (sqrtn + 0.12 + 0.11 / sqrtn) * d; + double sum = 0.0; + for (int k = 1; k <= 100; ++k) + { + const double term = std::exp(-2.0 * k * k * x * x); + sum += (k % 2 == 1) ? term : -term; + if (term < 1.0e-12) break; + } + return clamp01(2.0 * sum); + } + + static double ks_pvalue(const std::vector& samples, + const std::function& cdf) + { + return ks_pvalue_asymptotic(ks_statistic(samples, cdf), samples.size()); + } + + // Test that `coords` (a 1-D projection of source positions) is consistent + // with a uniform distribution. Coordinates are normalised to [0,1] using + // the empirical range before the KS test. For N >= 1 000 from a true + // uniform distribution the normalisation bias is O(1/N) — negligible + // against the ~1% critical value. 
+ static double ks_pvalue_uniform1d(const std::vector& coords) + { + if (coords.size() < 2) return 0.0; + const double lo = *std::min_element(coords.begin(), coords.end()); + const double hi = *std::max_element(coords.begin(), coords.end()); + if (hi - lo < 1.0e-9) return 0.0; + + std::vector u; + u.reserve(coords.size()); + for (double c : coords) + u.push_back((c - lo) / (hi - lo)); + + return ks_pvalue(u, [](double x) { return x; }); + } + + // ---- Scene helpers ----------------------------------------------------- + + // Build a SimulationData with a large flat plate at z=50 (200 × 200 world + // units), perfectly reflective, no optical or sun-shape errors. The plate + // is intentionally much larger than any realistic sun parallelogram so that + // all generated rays hit it and every source position is recorded. + static void make_large_plate_scene(SimulationData& sd, int seed = 42) + { + sd.clear(); + + auto stage = make_stage(0); + stage->set_origin(0, 0, 0); + stage->set_aim_vector(0, 0, 1); + stage->set_name("stage"); + + auto plate = make_element(); + plate->set_origin(0, 0, 50); + plate->set_aim_vector(0, 0, 100); + plate->set_surface(make_surface()); + plate->set_aperture(make_aperture(200, 200)); + OpticalProperties op(InteractionType::REFLECTION, + DistributionType::NONE, + 0, 1, 0, 0, 0, 0); + plate->set_front_optical_properties(op); + plate->set_back_optical_properties(op); + plate->set_name("plate"); + + stage->add_element(plate); + sd.add_stage(stage); + + SimulationParameters& params = sd.get_simulation_parameters(); + params.number_of_rays = 20000; + params.max_number_of_rays = params.number_of_rays * 10; + params.include_optical_errors = false; + params.include_sun_shape_errors = false; + params.seed = seed; + } + + // Add a sun pointing straight down (position overhead at z=100) to an + // already-configured SimulationData, using the requested gen_type. 
+ static void add_sun(SimulationData& sd, SolTrace::Data::GenType gen_type) + { + auto sun = make_ray_source(); + sun->set_position(0, 0, 100); + // PILLBOX shape with no sun-shape errors keeps rays exactly parallel, + // so hit positions on the plate are the source positions projected down. + sun->set_shape(SolTrace::Data::SunShape::PILLBOX, 0.0, 4.65, 0.0); + sun->set_gen_type(gen_type); + sd.add_ray_source(sun); + } + + // Run the simulation and populate `result`. Returns false on any failure. + static bool run_sim(SimulationData& sd, SimulationResult& result) + { + OptixRunner runner; + if (runner.initialize() != RunnerStatus::SUCCESS) return false; + if (runner.setup_simulation(&sd) != RunnerStatus::SUCCESS) return false; + if (runner.run_simulation() != RunnerStatus::SUCCESS) return false; + if (runner.report_simulation(&result, 0) != RunnerStatus::SUCCESS) return false; + return true; + } + + // Extract the X and Y components of position[0] (the ray-generation point) + // for every ray that has at least one surface interaction recorded. + static void collect_source_xy(const SimulationResult& result, + std::vector& xs, + std::vector& ys) + { + xs.clear(); + ys.clear(); + for (int i = 0; i < result.get_number_of_records(); ++i) + { + auto rec = result[i]; + if (!rec || rec->get_number_of_interactions() < 1) continue; + glm::dvec3 p; + rec->get_position(0, p); + xs.push_back(p[0]); + ys.push_back(p[1]); + } + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +// GenType::RANDOM: the marginal X and Y distributions of source positions +// should be statistically consistent with a uniform distribution over the +// sun parallelogram. 
+TEST(RayPositionSampling, Random_UniformMarginals) +{ + SimulationData sd; + make_large_plate_scene(sd); + add_sun(sd, SolTrace::Data::GenType::RANDOM); + + SimulationResult result; + ASSERT_TRUE(run_sim(sd, result)); + ASSERT_GT(result.get_number_of_records(), 0); + + std::vector xs, ys; + collect_source_xy(result, xs, ys); + ASSERT_GE(xs.size(), 1000u); + + const double p_x = ks_pvalue_uniform1d(xs); + const double p_y = ks_pvalue_uniform1d(ys); + + EXPECT_GT(p_x, 1.0e-6) << "X marginal deviates significantly from uniform"; + EXPECT_GT(p_y, 1.0e-6) << "Y marginal deviates significantly from uniform"; +} + +// GenType::HALTON: same uniformity requirement, via the Halton low-discrepancy +// sequence (bases 2 and 3 for the two parallelogram axes). +TEST(RayPositionSampling, Halton_UniformMarginals) +{ + SimulationData sd; + make_large_plate_scene(sd); + add_sun(sd, SolTrace::Data::GenType::HALTON); + + SimulationResult result; + ASSERT_TRUE(run_sim(sd, result)); + ASSERT_GT(result.get_number_of_records(), 0); + + std::vector xs, ys; + collect_source_xy(result, xs, ys); + ASSERT_GE(xs.size(), 1000u); + + const double p_x = ks_pvalue_uniform1d(xs); + const double p_y = ks_pvalue_uniform1d(ys); + + EXPECT_GT(p_x, 1.0e-6) << "X marginal deviates significantly from uniform"; + EXPECT_GT(p_y, 1.0e-6) << "Y marginal deviates significantly from uniform"; +} + +// GenType::HALTON — float-accumulator precision test. +// +// The GPU halton() function accumulates into a float. At index ≈ 2^23 ≈ 8.4 M +// the term 1/2^23 ≈ 1.19e-7 falls below FLT_EPSILON, causing distinct indices +// to map to the same output value (clumping). A KS test against the uniform +// CDF on a sample that covers this region will detect the resulting departure +// from uniformity. +// +// This test is intentionally slow (8 M rays) and is disabled by default. +// Enable it with --gtest_also_run_disabled_tests to reproduce the precision bug. 
+TEST(RayPositionSampling, DISABLED_Halton_FloatPrecisionAtHighIndices) +{ + SimulationData sd; + make_large_plate_scene(sd); + sd.get_simulation_parameters().number_of_rays = 8'000'000; + sd.get_simulation_parameters().max_number_of_rays = 8'000'000 * 2; + add_sun(sd, SolTrace::Data::GenType::HALTON); + + SimulationResult result; + ASSERT_TRUE(run_sim(sd, result)); + ASSERT_GT(result.get_number_of_records(), 0); + + std::vector xs, ys; + collect_source_xy(result, xs, ys); + ASSERT_GE(xs.size(), 1'000'000u); + + // With a correctly implemented double-accumulator Halton sequence the + // p-value here should be well above 1e-6. With the float accumulator the + // clumping at high indices causes the KS statistic to spike and this + // assertion fails. + const double p_x = ks_pvalue_uniform1d(xs); + const double p_y = ks_pvalue_uniform1d(ys); + + EXPECT_GT(p_x, 1.0e-6) + << "X marginal: float-accumulator clumping detected at high Halton indices"; + EXPECT_GT(p_y, 1.0e-6) + << "Y marginal: float-accumulator clumping detected at high Halton indices"; +} + +// GenType::HALTON is deterministic: the seed field has no effect because the +// sequence depends only on the ray index. Two runs with different seeds must +// produce the same set of source positions. 
+TEST(RayPositionSampling, Halton_Deterministic) +{ + // Run 1: seed = 1 + SimulationData sd1; + make_large_plate_scene(sd1, /*seed=*/1); + add_sun(sd1, SolTrace::Data::GenType::HALTON); + + SimulationResult r1; + ASSERT_TRUE(run_sim(sd1, r1)); + + // Run 2: seed = 99999 (different, should be ignored by Halton) + SimulationData sd2; + make_large_plate_scene(sd2, /*seed=*/99999); + add_sun(sd2, SolTrace::Data::GenType::HALTON); + + SimulationResult r2; + ASSERT_TRUE(run_sim(sd2, r2)); + + ASSERT_EQ(r1.get_number_of_records(), r2.get_number_of_records()); + + std::vector xs1, ys1, xs2, ys2; + collect_source_xy(r1, xs1, ys1); + collect_source_xy(r2, xs2, ys2); + ASSERT_EQ(xs1.size(), xs2.size()); + + // Sort both and compare element-by-element. + std::sort(xs1.begin(), xs1.end()); + std::sort(xs2.begin(), xs2.end()); + std::sort(ys1.begin(), ys1.end()); + std::sort(ys2.begin(), ys2.end()); + + for (size_t i = 0; i < xs1.size(); ++i) + { + EXPECT_NEAR(xs1[i], xs2[i], 1.0e-4) + << "Halton X differs at sorted index " << i + << " — sequence is not deterministic across seeds"; + EXPECT_NEAR(ys1[i], ys2[i], 1.0e-4) + << "Halton Y differs at sorted index " << i + << " — sequence is not deterministic across seeds"; + } +} + +// GenType::RANDOM IS seed-dependent: two runs with different seeds must +// produce distinguishably different position sets. 
+TEST(RayPositionSampling, Random_SeedDependent) +{ + SimulationData sd1; + make_large_plate_scene(sd1, /*seed=*/1); + add_sun(sd1, SolTrace::Data::GenType::RANDOM); + + SimulationData sd2; + make_large_plate_scene(sd2, /*seed=*/2); + add_sun(sd2, SolTrace::Data::GenType::RANDOM); + + SimulationResult r1, r2; + ASSERT_TRUE(run_sim(sd1, r1)); + ASSERT_TRUE(run_sim(sd2, r2)); + ASSERT_GT(r1.get_number_of_records(), 0); + ASSERT_GT(r2.get_number_of_records(), 0); + + std::vector xs1, ys1, xs2, ys2; + collect_source_xy(r1, xs1, ys1); + collect_source_xy(r2, xs2, ys2); + ASSERT_FALSE(xs1.empty()); + ASSERT_FALSE(xs2.empty()); + + const double mean_x1 = std::accumulate(xs1.begin(), xs1.end(), 0.0) / xs1.size(); + const double mean_x2 = std::accumulate(xs2.begin(), xs2.end(), 0.0) / xs2.size(); + const double mean_y1 = std::accumulate(ys1.begin(), ys1.end(), 0.0) / ys1.size(); + const double mean_y2 = std::accumulate(ys2.begin(), ys2.end(), 0.0) / ys2.size(); + + // With 20 000 rays and different seeds the probability that both means + // coincide to within 1e-4 is negligible. 
+ const bool x_differs = std::abs(mean_x1 - mean_x2) > 1.0e-4; + const bool y_differs = std::abs(mean_y1 - mean_y2) > 1.0e-4; + + EXPECT_TRUE(x_differs || y_differs) + << "Seeds 1 and 2 produced indistinguishable mean source positions; " + "seed variation may not be wired up for RANDOM gen_type."; +} From b347a6e1067df9d917bda06d17857bcc5f0aac1a Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 26 May 2026 12:35:29 -0600 Subject: [PATCH 51/60] Fix halton generation method; add warning for exceeding 32-bit uint max with max rays; add/remove tests --- .../optix_runner/OptixCSP/src/shaders/sun.cu | 126 +++++++++--------- .../optix_runner/optix_runner.cpp | 11 ++ .../ray_position_sampling_test.cpp | 39 ------ .../optix_runner/sun_test.cpp | 53 ++++++++ 4 files changed, 127 insertions(+), 102 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu index 3d75cf8d..fd83a085 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/sun.cu @@ -2,25 +2,29 @@ #include #include -//#include -//#include +// #include +// #include #include "Soltrace.h" #include "soltrace_constants.h" #include // Launch parameters for soltrace -extern "C" { +extern "C" +{ __constant__ OptixCSP::LaunchParams params; } -namespace OptixCSP { +namespace OptixCSP +{ // Halton sequence generator, used for quasi-random sampling // Generates a Halton sequence value for a given index and base - __device__ float halton(int index, int base) { + __device__ float halton(unsigned int index, unsigned int base) + { float f = 1.0f, result = 0.0f; - while (index > 0) { + while (index > 0) + { f = f / base; result = result + f * (index % base); index = index / base; @@ -30,7 +34,8 @@ namespace OptixCSP { // Generate a sample point within a parallelogram defined by the AABB (Axis-Aligned Bounding Box) // Uses the 
Halton sequence for sampling - __device__ float3 haltonSampleInParallelogram(unsigned int sample_index) { + __device__ float3 haltonSampleInParallelogram(unsigned int sample_index) + { // Generate Halton sequence values float u = halton(sample_index, 2); // Base 2 for x float v = halton(sample_index, 3); // Base 3 for y @@ -58,7 +63,8 @@ namespace OptixCSP { } // Sample a random ray direction within a cone defined by a maximum angle - __device__ float3 sampleRayDirectionInCone_Pillbox(float3 dir, float half_angle, unsigned int ray_number) { + __device__ float3 sampleRayDirectionInCone_Pillbox(float3 dir, float half_angle, unsigned int ray_number) + { curandState rng_state = params.rng_states[ray_number]; const float half_angle_mrad = half_angle; @@ -89,10 +95,11 @@ namespace OptixCSP { return normalize(sin_t * (cosf(phi) * u + sinf(phi) * v) + cos_t * w); } - __device__ float3 sampleRayDirectionInCone_Gaussian(float3 dir, float sigma, unsigned int ray_number) { + __device__ float3 sampleRayDirectionInCone_Gaussian(float3 dir, float sigma, unsigned int ray_number) + { curandState rng = params.rng_states[ray_number]; - const float sigma_rad = sigma * 0.001f; // Convert to rad + const float sigma_rad = sigma * 0.001f; // Convert to rad // Build an orthonormal basis float3 w = normalize(dir); @@ -117,7 +124,7 @@ namespace OptixCSP { { curandState rng = params.rng_states[ray_number]; - const float max_angle_mrad = params.sun_max_angle; // [mrad] + const float max_angle_mrad = params.sun_max_angle; // [mrad] // Orthonormal basis about dir float3 w = normalize(dir); @@ -139,7 +146,7 @@ namespace OptixCSP { theta2 = thetax * thetax + thetay * thetay; theta = sqrtf(theta2); - if (theta <= 4.65f) // within solar disc (mrad, as in CPU code) + if (theta <= 4.65f) // within solar disc (mrad, as in CPU code) { // stest = cos(0.326 * theta) / cos(0.308 * theta); float t = theta; @@ -166,8 +173,7 @@ namespace OptixCSP { float3 local_dir = make_float3( sin_t * cosf(phi), sin_t * 
sinf(phi), - cos_t - ); + cos_t); params.rng_states[ray_number] = rng; @@ -214,8 +220,7 @@ namespace OptixCSP { float3 local_dir = make_float3( sin_t * cosf(phi), sin_t * sinf(phi), - cos_t - ); + cos_t); params.rng_states[ray_number] = rng; @@ -224,8 +229,8 @@ namespace OptixCSP { return world_dir; } - __device__ float3 sampleRayDirectionInCone_UserDefined(float3 dir, int user_capacity, float* user_angle, - float* user_intensity, unsigned int ray_number) + __device__ float3 sampleRayDirectionInCone_UserDefined(float3 dir, int user_capacity, float *user_angle, + float *user_intensity, unsigned int ray_number) { curandState rng = params.rng_states[ray_number]; @@ -234,7 +239,7 @@ namespace OptixCSP { return normalize(dir); } - const float max_angle_mrad = params.sun_max_angle; // [mrad] + const float max_angle_mrad = params.sun_max_angle; // [mrad] const float max_int = params.sun_max_intensity; // Orthonormal basis about dir @@ -266,8 +271,7 @@ namespace OptixCSP { stest = user_intensity[i]; else { - stest = user_intensity[i - 1] + (user_intensity[i] - user_intensity[i - 1]) * (theta - user_angle[i - 1]) - / denom; + stest = user_intensity[i - 1] + (user_intensity[i] - user_intensity[i - 1]) * (theta - user_angle[i - 1]) / denom; } } @@ -285,8 +289,7 @@ namespace OptixCSP { float3 local_dir = make_float3( sin_t * cosf(phi), sin_t * sinf(phi), - cos_t - ); + cos_t); params.rng_states[ray_number] = rng; @@ -300,22 +303,22 @@ namespace OptixCSP { extern "C" __global__ void __raygen__sun_source() { // Lookup location in launch grid - const uint3 launch_idx = optixGetLaunchIndex(); // Index of the current launch thread - const uint3 launch_dims = optixGetLaunchDimensions(); // Dimensions of the launch grid - const unsigned int ray_number = launch_idx.y * launch_dims.x + launch_idx.x; // Unique ray ID - const unsigned long long ray_number_global = ray_number + params.ray_offset; // Global unique ray ID + const uint3 launch_idx = optixGetLaunchIndex(); // Index of the 
current launch thread + const uint3 launch_dims = optixGetLaunchDimensions(); // Dimensions of the launch grid + const unsigned int ray_number = launch_idx.y * launch_dims.x + launch_idx.x; // Unique ray ID + const unsigned long long ray_number_global = ray_number + params.ray_offset; // Global unique ray ID float3 sun_sample_pos; switch (params.sun_gen_type) { - case(OptixCSP::GenType::RANDOM): - sun_sample_pos = OptixCSP::randomSampleInParallelogram(ray_number); - break; - case(OptixCSP::GenType::HALTON): - sun_sample_pos = OptixCSP::haltonSampleInParallelogram(ray_number_global); - break; - default: - return; + case (OptixCSP::GenType::RANDOM): + sun_sample_pos = OptixCSP::randomSampleInParallelogram(ray_number); + break; + case (OptixCSP::GenType::HALTON): + sun_sample_pos = OptixCSP::haltonSampleInParallelogram(ray_number_global); + break; + default: + return; } // Sample emission angle here - capturing sun distribution @@ -329,35 +332,34 @@ extern "C" __global__ void __raygen__sun_source() { switch (params.sun_shape) { - case(OptixCSP::SunShape::PILLBOX): - ray_dir = OptixCSP::sampleRayDirectionInCone_Pillbox(init_ray_dir, params.half_width, ray_number); - break; - case(OptixCSP::SunShape::GAUSSIAN): - ray_dir = OptixCSP::sampleRayDirectionInCone_Gaussian(init_ray_dir, params.sigma, ray_number); - break; - case(OptixCSP::SunShape::BUIE_CSR): - ray_dir = OptixCSP::sampleRayDirectionInCone_BuieCSR(init_ray_dir, params.buie_kappa, params.buie_gamma, ray_number); - break; - case(OptixCSP::SunShape::LIMBDARKENED): - ray_dir = OptixCSP::sampleRayDirectionInCone_LimbDarkened(init_ray_dir, ray_number); - break; - case(OptixCSP::SunShape::USER_DEFINED): - ray_dir = OptixCSP::sampleRayDirectionInCone_UserDefined(init_ray_dir, params.sun_user_capacity, - params.sun_user_angle, params.sun_user_intensity, ray_number); - break; - default: - assert(false); - // Just return since the sun shape is not supported - return; + case (OptixCSP::SunShape::PILLBOX): + ray_dir = 
OptixCSP::sampleRayDirectionInCone_Pillbox(init_ray_dir, params.half_width, ray_number); + break; + case (OptixCSP::SunShape::GAUSSIAN): + ray_dir = OptixCSP::sampleRayDirectionInCone_Gaussian(init_ray_dir, params.sigma, ray_number); + break; + case (OptixCSP::SunShape::BUIE_CSR): + ray_dir = OptixCSP::sampleRayDirectionInCone_BuieCSR(init_ray_dir, params.buie_kappa, params.buie_gamma, ray_number); + break; + case (OptixCSP::SunShape::LIMBDARKENED): + ray_dir = OptixCSP::sampleRayDirectionInCone_LimbDarkened(init_ray_dir, ray_number); + break; + case (OptixCSP::SunShape::USER_DEFINED): + ray_dir = OptixCSP::sampleRayDirectionInCone_UserDefined(init_ray_dir, params.sun_user_capacity, + params.sun_user_angle, params.sun_user_intensity, ray_number); + break; + default: + assert(false); + // Just return since the sun shape is not supported + return; } } else { ray_dir = init_ray_dir; } - - - //float3 ray_dir = OptixCSP::sampleRayDirectionInCone_Gaussian(init_ray_dir, params.max_sun_angle, ray_number); + + // float3 ray_dir = OptixCSP::sampleRayDirectionInCone_Gaussian(init_ray_dir, params.max_sun_angle, ray_number); // Create the PerRayData structure to track ray state (e.g., path index and recursion depth) OptixCSP::PerRayData prd; @@ -368,7 +370,6 @@ extern "C" __global__ void __raygen__sun_source() params.hit_buffer[params.max_depth * prd.ray_path_index].element_id = OptixCSP::kElementIdRayGen; params.hit_buffer[params.max_depth * prd.ray_path_index].hit_type = OptixCSP::HitType::HIT_CREATE; params.sun_dir_buffer[prd.ray_path_index] = ray_dir; - // Cast and trace the ray through the scene optixTrace( @@ -383,7 +384,6 @@ extern "C" __global__ void __raygen__sun_source() OptixCSP::RAY_TYPE_RADIANCE, // Ray type (radiance for sunlight) OptixCSP::RAY_TYPE_COUNT, // Number of ray types OptixCSP::RAY_TYPE_RADIANCE, // SBT offset (ray type to launch) - reinterpret_cast(prd.ray_path_index), - reinterpret_cast(prd.depth) - ); -} \ No newline at end of file + 
reinterpret_cast(prd.ray_path_index), + reinterpret_cast(prd.depth)); +} diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index fd9f8535..c7724924 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -140,6 +140,17 @@ RunnerStatus OptixRunner::setup_sun(const SimulationData *data) } } + // Warn if Halton sampling is used with more rays than uint32_t can index, + // since the Halton sequence index is truncated to 32 bits causing repeated positions. + if (sun->get_gen_type() == SolTrace::Data::GenType::HALTON && + data->get_simulation_parameters().max_number_of_rays > static_cast(std::numeric_limits::max())) + { + std::cerr << "Warning: max_number_of_rays exceeds 32-bit unsigned int maximum (" + << std::numeric_limits::max() + << ") with Halton ray generation. Halton sequence positions will repeat after index " + << std::numeric_limits::max() << "." << std::endl; + } + return RunnerStatus::SUCCESS; } diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp index b987bf3d..244a9687 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/ray_position_sampling_test.cpp @@ -224,45 +224,6 @@ TEST(RayPositionSampling, Halton_UniformMarginals) EXPECT_GT(p_y, 1.0e-6) << "Y marginal deviates significantly from uniform"; } -// GenType::HALTON — float-accumulator precision test. -// -// The GPU halton() function accumulates into a float. At index ≈ 2^23 ≈ 8.4 M -// the term 1/2^23 ≈ 1.19e-7 falls below FLT_EPSILON, causing distinct indices -// to map to the same output value (clumping). 
A KS test against the uniform -// CDF on a sample that covers this region will detect the resulting departure -// from uniformity. -// -// This test is intentionally slow (8 M rays) and is disabled by default. -// Enable it with --gtest_also_run_disabled_tests to reproduce the precision bug. -TEST(RayPositionSampling, DISABLED_Halton_FloatPrecisionAtHighIndices) -{ - SimulationData sd; - make_large_plate_scene(sd); - sd.get_simulation_parameters().number_of_rays = 8'000'000; - sd.get_simulation_parameters().max_number_of_rays = 8'000'000 * 2; - add_sun(sd, SolTrace::Data::GenType::HALTON); - - SimulationResult result; - ASSERT_TRUE(run_sim(sd, result)); - ASSERT_GT(result.get_number_of_records(), 0); - - std::vector xs, ys; - collect_source_xy(result, xs, ys); - ASSERT_GE(xs.size(), 1'000'000u); - - // With a correctly implemented double-accumulator Halton sequence the - // p-value here should be well above 1e-6. With the float accumulator the - // clumping at high indices causes the KS statistic to spike and this - // assertion fails. - const double p_x = ks_pvalue_uniform1d(xs); - const double p_y = ks_pvalue_uniform1d(ys); - - EXPECT_GT(p_x, 1.0e-6) - << "X marginal: float-accumulator clumping detected at high Halton indices"; - EXPECT_GT(p_y, 1.0e-6) - << "Y marginal: float-accumulator clumping detected at high Halton indices"; -} - // GenType::HALTON is deterministic: the seed field has no effect because the // sequence depends only on the ray index. Two runs with different seeds must // produce the same set of source positions. 
diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp index c2ff398e..96c7af34 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp @@ -804,3 +804,56 @@ TEST(Sun, UserDefinedSunAngleDistribution) EXPECT_GT(frac_beyond_disc, 0.0); EXPECT_LT(frac_beyond_disc, 0.05); } + +// Verify that a warning is printed to stderr when Halton ray generation is used +// with max_number_of_rays exceeding UINT32_MAX, since the Halton index is truncated +// to 32 bits and positions will repeat. +TEST(Sun, HaltonWarningWhenExceedingUInt32Max) +{ + SimulationData sd; + element_ptr plate; + make_default_sd_sun(sd, plate); + + auto sun = make_ray_source(); + sun->set_position(0, 0, 100); + sun->set_gen_type(SolTrace::Data::GenType::HALTON); + sd.add_ray_source(sun); + + SimulationParameters& params = sd.get_simulation_parameters(); + params.number_of_rays = 1000; + params.max_number_of_rays = static_cast(std::numeric_limits::max()) + 1; + + OptixRunner runner; + testing::internal::CaptureStderr(); + runner.setup_simulation(&sd); + const std::string output = testing::internal::GetCapturedStderr(); + + EXPECT_NE(output.find("Warning"), std::string::npos) + << "Expected a warning about Halton index overflow in stderr, but got: " << output; +} + +// Verify that no warning is printed when Halton is used with max_number_of_rays +// within the 32-bit range. 
+TEST(Sun, NoHaltonWarningWithinUInt32Range) +{ + SimulationData sd; + element_ptr plate; + make_default_sd_sun(sd, plate); + + auto sun = make_ray_source(); + sun->set_position(0, 0, 100); + sun->set_gen_type(SolTrace::Data::GenType::HALTON); + sd.add_ray_source(sun); + + SimulationParameters& params = sd.get_simulation_parameters(); + params.number_of_rays = 1000; + params.max_number_of_rays = static_cast(std::numeric_limits::max()); + + OptixRunner runner; + testing::internal::CaptureStderr(); + runner.setup_simulation(&sd); + const std::string output = testing::internal::GetCapturedStderr(); + + EXPECT_EQ(output.find("Warning"), std::string::npos) + << "Unexpected Halton warning in stderr: " << output; +} From bfdec00b9fd26ae0aebef799c7ddb7f80b7f921e Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 26 May 2026 13:03:01 -0600 Subject: [PATCH 52/60] Make max trace depth a user accessible parameter at the optix runner level --- .../OptixCSP/src/core/pipeline_manager.cpp | 5 ++--- .../OptixCSP/src/core/pipeline_manager.h | 3 +++ .../OptixCSP/src/core/soltrace_system.cpp | 12 +++++++----- .../OptixCSP/src/core/soltrace_system.h | 6 ++++++ .../OptixCSP/src/shaders/Soltrace.h | 4 ++-- .../optix_runner/optix_runner.cpp | 18 ++++++++++++++++++ .../optix_runner/optix_runner.hpp | 5 +++++ 7 files changed, 43 insertions(+), 10 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp index b012e95a..bc488eb4 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp @@ -172,8 +172,7 @@ void pipelineManager::createPipeline() // Link program groups to pipeline OptixPipelineLinkOptions pipeline_link_options = {}; - // TODO max trace belong to who?
- pipeline_link_options.maxTraceDepth = MAX_TRACE_DEPTH; // Maximum recursion depth for ray tracing. + pipeline_link_options.maxTraceDepth = m_max_trace_depth; // Maximum recursion depth for ray tracing. // Create the OptiX pipeline by linking the program groups. OPTIX_CHECK(optixPipelineCreate( @@ -200,7 +199,7 @@ void pipelineManager::createPipeline() // Compute stack sizes based on the maximum trace depth and other settings. OPTIX_CHECK(optixUtilComputeStackSizes( &stack_sizes, // Input stack sizes. - MAX_TRACE_DEPTH, // Maximum trace depth. + m_max_trace_depth, // Maximum trace depth. 0, // maxCCDepth: Maximum depth of continuation callables (none in this case). 0, // maxDCDepth: Maximum depth of direct callables (none in this case). &direct_callable_stack_size_from_traversal, // Output: Stack size for callable traversal. diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.h index 0d77791b..510b39f8 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.h @@ -34,6 +34,8 @@ namespace OptixCSP void set_verbose(bool verbose) { m_verbose = verbose; } + void set_max_trace_depth(uint8_t depth) { m_max_trace_depth = depth; } + /** * @brief Loads PTX code from a file. * @param kernelName The name of the PTX kernel file to load. @@ -112,6 +114,7 @@ namespace OptixCSP std::vector m_program_groups; ///< Stores all created OptiX program groups. std::map m_intersection_program_group_map; ///< Map surface-aperture combinations to index in m_program_groups bool m_verbose = false; + uint8_t m_max_trace_depth = DEFAULT_MAX_TRACE_DEPTH; // // Number of program groups categorized by type. // int num_raygen_programs = 1; ///< Number of ray generation programs. 
diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 1ba693f5..8ed27894 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -36,6 +36,7 @@ SolTraceSystem::SolTraceSystem() : m_number_of_rays(0), m_max_number_of_rays(0), m_batch_size(0), + m_max_ray_depth(DEFAULT_MAX_TRACE_DEPTH), m_verbose(false), m_mem_free_before(0), m_mem_free_after(0), @@ -242,6 +243,7 @@ void SolTraceSystem::initialize() // Pipeline setup. m_timer_pipeline.reset(); m_timer_pipeline.start(); + pipeline_manager->set_max_trace_depth(m_max_ray_depth); pipeline_manager->createPipeline(); m_timer_pipeline.stop(); @@ -630,7 +632,7 @@ void SolTraceSystem::allocate_device_buffers() const uint_fast64_t effective_batch = determine_batch_size(); data_manager->launch_params_H.width = static_cast(effective_batch); data_manager->launch_params_H.height = 1; - data_manager->launch_params_H.max_depth = MAX_TRACE_DEPTH; + data_manager->launch_params_H.max_depth = m_max_ray_depth; const size_t hit_buffer_size = static_cast(data_manager->launch_params_H.width) * static_cast(data_manager->launch_params_H.height) * static_cast(data_manager->launch_params_H.max_depth) * sizeof(HitRecord); const size_t sun_dir_size = static_cast(data_manager->launch_params_H.width) * static_cast(data_manager->launch_params_H.height) * sizeof(float3); @@ -828,15 +830,15 @@ uint_fast64_t SolTraceSystem::automatic_batch_size() const // Per-ray device memory charged by allocate_device_buffers() and // allocate_compaction_scratch(): - // hit_buffer MAX_TRACE_DEPTH * sizeof(HitRecord) -- trace output - // d_compacted MAX_TRACE_DEPTH * sizeof(HitRecord) -- worst-case compacted copy + // hit_buffer m_max_ray_depth * sizeof(HitRecord) -- trace output + // d_compacted
m_max_ray_depth * sizeof(HitRecord) -- worst-case compacted copy // sun_dir_buffer sizeof(float3) -- sun ray direction // curand states sizeof(curandState) -- RNG state // d_offsets sizeof(uint64_t) -- compaction prefix sum / global ray IDs - // d_count sizeof(uint8_t) -- compaction hit count (bounded by MAX_TRACE_DEPTH <= 255) + // d_count sizeof(uint8_t) -- compaction hit count (bounded by m_max_ray_depth <= 255) // d_has_hit sizeof(uint8_t) -- per-ray hit flag const size_t bytes_per_ray = - 2u * MAX_TRACE_DEPTH * sizeof(HitRecord) + sizeof(float3) + sizeof(curandState) + sizeof(uint64_t) + 2u * sizeof(uint8_t); + 2u * m_max_ray_depth * sizeof(HitRecord) + sizeof(float3) + sizeof(curandState) + sizeof(uint64_t) + 2u * sizeof(uint8_t); const uint_fast64_t computed = (bytes_per_ray > 0) ? static_cast(usable_bytes / bytes_per_ray) : 0u; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 7c3be6d4..a0c005ed 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -85,6 +85,11 @@ namespace OptixCSP } uint_fast64_t get_batch_size() const { return m_batch_size; } + /// Set the maximum ray interaction depth. Must be called before initialize(). + /// The value is stored as given (this setter does not clamp); the uint8_t parameter limits it to at most 255. Defaults to DEFAULT_MAX_TRACE_DEPTH.
+ void set_max_ray_depth(uint8_t depth) { m_max_ray_depth = depth; } + uint8_t get_max_ray_depth() const { return m_max_ray_depth; } + void set_sun(SolTrace::Data::Sun *sun) { m_sun = sun; } void set_seed(uint64_t seed) { m_seed = seed; } // Set sun seed @@ -139,6 +144,7 @@ namespace OptixCSP uint_fast64_t m_number_of_rays; uint_fast64_t m_max_number_of_rays; uint_fast64_t m_batch_size = 0; // 0 means auto-size: determine_batch_size() calls automatic_batch_size() + uint8_t m_max_ray_depth = DEFAULT_MAX_TRACE_DEPTH; bool m_verbose; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index d2e531b9..6eeb5420 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -14,9 +14,9 @@ namespace OptixCSP{ const unsigned int NUM_ATTRIBUTE_VALUES = 4u; const unsigned int NUM_PAYLOAD_VALUES = 2u; // NOTE: Maximum number of ray interactions in tracing with the geometry is - // MAX_TRACE_DEPTH - 1 (so currently 4). See the end of the function + // DEFAULT_MAX_TRACE_DEPTH - 1 (so currently 4). See the end of the function // __closesthit__element in materials.cu. Note the type. Limited to 255. 
- const uint8_t MAX_TRACE_DEPTH = 5u; + const uint8_t DEFAULT_MAX_TRACE_DEPTH = 5u; struct HitGroupData { diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index c7724924..e0d341e5 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -3,6 +3,7 @@ #include "simulation_data/simulation_data.hpp" #include "simulation_data/simulation_data_export.hpp" +#include #include #include @@ -30,6 +31,23 @@ void OptixRunner::print_timing() const m_sys.print_timing(); } +void OptixRunner::set_max_ray_depth(uint_fast64_t depth) +{ + if (depth < 2) + { + std::cerr << "[OptixRunner] WARNING: max_ray_depth (" << depth + << ") is below the minimum of 2. Clamping to 2.\n"; + depth = 2; + } + else if (depth > 255) + { + std::cerr << "[OptixRunner] WARNING: max_ray_depth (" << depth + << ") exceeds the maximum of 255. Clamping to 255.\n"; + depth = 255; + } + m_sys.set_max_ray_depth(static_cast(depth)); +} + void OptixRunner::set_batch_size(uint_fast64_t batch_size) { m_sys.set_batch_size(batch_size); diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 025cec37..6ef54ad6 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -49,6 +49,11 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner void set_batch_size(uint_fast64_t batch_size); uint_fast64_t get_batch_size() const; + /// Set the maximum ray interaction depth. Must be called before initialize(). + /// Depth is clamped to [2, 255] with a warning if either bound is exceeded. Defaults to DEFAULT_MAX_TRACE_DEPTH. 
+ void set_max_ray_depth(uint_fast64_t depth); + uint8_t get_max_ray_depth() const { return m_sys.get_max_ray_depth(); } + /// Enable or disable trimming of excess rays at the end of run() so that /// exactly the requested number of hit rays is returned. Enabled by default. void set_trim_excess_rays(bool trim); From df0f560f4326930d1176d00f980958ecd7b017a5 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 26 May 2026 13:08:42 -0600 Subject: [PATCH 53/60] Add tests for user setting max trace depth --- .../optix_runner/CMakeLists.txt | 1 + .../optix_runner/max_ray_depth_test.cpp | 107 ++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt b/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt index bbfcf9ae..9df394d1 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt +++ b/google-tests/unit-tests/simulation_runner/optix_runner/CMakeLists.txt @@ -22,6 +22,7 @@ set(OPTIX_RUNNER_TEST_SRC sun_test.cpp ray_position_sampling_test.cpp batch_size_test.cpp + max_ray_depth_test.cpp ) add_executable(OptixRunnerUnitTests diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp new file mode 100644 index 00000000..960e0c94 --- /dev/null +++ b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp @@ -0,0 +1,107 @@ +#include + +#include +#include // OptixCSP::DEFAULT_MAX_TRACE_DEPTH + +// --------------------------------------------------------------------------- +// set_max_ray_depth / get_max_ray_depth accessor tests (no GPU required) +// --------------------------------------------------------------------------- + +TEST(OptixRunnerMaxRayDepth, DefaultIsDefaultMaxTraceDepth) +{ + OptixRunner runner; + 
EXPECT_EQ(runner.get_max_ray_depth(), OptixCSP::DEFAULT_MAX_TRACE_DEPTH); +} + +TEST(OptixRunnerMaxRayDepth, SetAndGet) +{ + OptixRunner runner; + runner.set_max_ray_depth(10); + EXPECT_EQ(runner.get_max_ray_depth(), 10u); +} + +// Minimum valid depth is 2; setting exactly 2 must be accepted without clamping. +TEST(OptixRunnerMaxRayDepth, MinimumBoundaryAccepted) +{ + OptixRunner runner; + runner.set_max_ray_depth(2); + EXPECT_EQ(runner.get_max_ray_depth(), 2u); +} + +// Maximum valid depth is 255; setting exactly 255 must be accepted without clamping. +TEST(OptixRunnerMaxRayDepth, MaximumBoundaryAccepted) +{ + OptixRunner runner; + runner.set_max_ray_depth(255); + EXPECT_EQ(runner.get_max_ray_depth(), 255u); +} + +// Depth of 0 is below the minimum; must be clamped to 2 with a warning. +TEST(OptixRunnerMaxRayDepth, ZeroClampsToMinimum) +{ + OptixRunner runner; + testing::internal::CaptureStderr(); + runner.set_max_ray_depth(0); + const std::string output = testing::internal::GetCapturedStderr(); + EXPECT_EQ(runner.get_max_ray_depth(), 2u); + EXPECT_NE(output.find("WARNING"), std::string::npos); +} + +// Depth of 1 is below the minimum; must be clamped to 2 with a warning. +TEST(OptixRunnerMaxRayDepth, OneClampsToMinimum) +{ + OptixRunner runner; + testing::internal::CaptureStderr(); + runner.set_max_ray_depth(1); + const std::string output = testing::internal::GetCapturedStderr(); + EXPECT_EQ(runner.get_max_ray_depth(), 2u); + EXPECT_NE(output.find("WARNING"), std::string::npos); +} + +// Depth of 256 exceeds the maximum; must be clamped to 255 with a warning. +TEST(OptixRunnerMaxRayDepth, ExceedingMaxClampsTo255) +{ + OptixRunner runner; + testing::internal::CaptureStderr(); + runner.set_max_ray_depth(256); + const std::string output = testing::internal::GetCapturedStderr(); + EXPECT_EQ(runner.get_max_ray_depth(), 255u); + EXPECT_NE(output.find("WARNING"), std::string::npos); +} + +// Large values (e.g. passing a big int) must also clamp to 255 with a warning. 
+TEST(OptixRunnerMaxRayDepth, LargeValueClampsTo255) +{ + OptixRunner runner; + testing::internal::CaptureStderr(); + runner.set_max_ray_depth(10000); + const std::string output = testing::internal::GetCapturedStderr(); + EXPECT_EQ(runner.get_max_ray_depth(), 255u); + EXPECT_NE(output.find("WARNING"), std::string::npos); +} + +// An out-of-range value is always clamped and stored, replacing any previous value. +TEST(OptixRunnerMaxRayDepth, ClampedValueOverwritesPreviousValue) +{ + OptixRunner runner; + runner.set_max_ray_depth(20); + ASSERT_EQ(runner.get_max_ray_depth(), 20u); + + testing::internal::CaptureStderr(); + runner.set_max_ray_depth(0); + const std::string output = testing::internal::GetCapturedStderr(); + EXPECT_EQ(runner.get_max_ray_depth(), 2u); + EXPECT_NE(output.find("WARNING"), std::string::npos); +} + +// Setting a valid depth after an out-of-range call must work normally. +TEST(OptixRunnerMaxRayDepth, ValidSetAfterClamp) +{ + OptixRunner runner; + testing::internal::CaptureStderr(); + runner.set_max_ray_depth(0); + testing::internal::GetCapturedStderr(); + + runner.set_max_ray_depth(8); + EXPECT_EQ(runner.get_max_ray_depth(), 8u); +} From 3cc88e78b85198e4281d514b0c710154dd366e0e Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 26 May 2026 13:27:34 -0600 Subject: [PATCH 54/60] Add test checking that max trace depth is observed during tracing --- .../optix_runner/max_ray_depth_test.cpp | 69 ++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp index 960e0c94..86b657cc 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp @@ -1,7 +1,20 @@ #include #include -#include // OptixCSP::DEFAULT_MAX_TRACE_DEPTH +#include +#include +#include +#include 
// OptixCSP::DEFAULT_MAX_TRACE_DEPTH +#include // OptixCSP::HitType + +using SolTrace::Runner::RunnerStatus; + +// Reuse the two-plate scene defined in two_plate_test.cpp. +// Plate 1 is ideal-reflective; plate 2 is ideal-absorptive. +// Each ray that hits the scene makes exactly 2 element interactions: +// 1. reflect off plate 1 +// 2. absorb on plate 2 +void make_two_plate_sd(SimulationData &sd, element_ptr &plate1, element_ptr &plate2); // --------------------------------------------------------------------------- // set_max_ray_depth / get_max_ray_depth accessor tests (no GPU required) @@ -105,3 +118,57 @@ TEST(OptixRunnerMaxRayDepth, ValidSetAfterClamp) runner.set_max_ray_depth(8); EXPECT_EQ(runner.get_max_ray_depth(), 8u); } + +// --------------------------------------------------------------------------- +// GPU trace test: verify the depth limit is actually enforced during tracing. +// +// The two-plate scene produces exactly 2 element interactions per ray under +// normal conditions (reflect off plate 1, absorb on plate 2). Setting +// max_ray_depth = 2 reduces the maximum interactions per ray to 1, so the +// second interaction is cut off. We verify that no ray in the hit buffer +// carries more than (max_ray_depth - 1) element interactions. 
+// --------------------------------------------------------------------------- +TEST(OptixRunnerMaxRayDepth, TraceDepthNotExceeded) +{ + SimulationData sd; + element_ptr plate1, plate2; + make_two_plate_sd(sd, plate1, plate2); + + const uint_fast64_t test_max_depth = 2; // allows at most 1 interaction per ray + OptixRunner runner; + runner.set_max_ray_depth(test_max_depth); + ASSERT_EQ(runner.get_max_ray_depth(), static_cast(test_max_depth)); + + RunnerStatus sts = runner.initialize(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.setup_simulation(&sd); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.run_simulation(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + OptixCSP::SolTraceSystem *sys = runner.get_optix_system(); + std::vector hp_vec; + std::vector raynumber_vec; + std::vector element_id_vec; + std::vector hit_type_vec; + sys->get_hp_output(hp_vec, raynumber_vec, element_id_vec, hit_type_vec); + + // Walk through every record. HIT_CREATE marks the start of a new ray; + // all other types are element interactions. No ray may accumulate more + // than (max_ray_depth - 1) interactions. 
+ const uint_fast64_t max_interactions = test_max_depth - 1; + uint_fast64_t current_interactions = 0; + for (uint8_t ht : hit_type_vec) + { + if (ht == OptixCSP::HitType::HIT_CREATE) + { + current_interactions = 0; + } + else + { + ++current_interactions; + EXPECT_LE(current_interactions, max_interactions) + << "A ray exceeded max_ray_depth - 1 element interactions"; + } + } +} From 663e14292b10f7d617614f031412eaa420d752d1 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 26 May 2026 13:56:59 -0600 Subject: [PATCH 55/60] Add count of rays terminated due to max depth --- .../OptixCSP/src/core/data_manager.cpp | 21 ++++++++++++++++++- .../OptixCSP/src/core/data_manager.h | 6 ++++++ .../OptixCSP/src/core/soltrace_system.cpp | 18 ++++++++++++++++ .../OptixCSP/src/core/soltrace_system.h | 4 ++++ .../OptixCSP/src/shaders/Soltrace.h | 1 + .../OptixCSP/src/shaders/materials.cu | 5 +++++ 6 files changed, 54 insertions(+), 1 deletion(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp index 6e0e46f0..9ac1344b 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.cpp @@ -17,7 +17,8 @@ dataManager::dataManager() sun_user_intensity_D(nullptr), sun_user_capacity(0), rng_states_D(nullptr), - rng_states_capacity(0) + rng_states_capacity(0), + depth_exceeded_count_D(nullptr) { // Initialize launch parameters with default values @@ -56,6 +57,8 @@ dataManager::dataManager() launch_params_H.material_data_array_back = nullptr; launch_params_H.sun_dir_seed = 0ULL; + launch_params_H.d_depth_exceeded_count = nullptr; + launch_params_H.geometry_data_array = nullptr; launch_params_H.handle = OptixTraversableHandle{}; @@ -112,6 +115,16 @@ void dataManager::ensureCurandStates( launch_params_H.rng_states = rng_states_D; } +void 
dataManager::ensureDepthExceededCounter() +{ + if (depth_exceeded_count_D == nullptr) + { + CUDA_CHECK(cudaMalloc(reinterpret_cast(&depth_exceeded_count_D), sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(depth_exceeded_count_D, 0, sizeof(uint64_t))); + launch_params_H.d_depth_exceeded_count = depth_exceeded_count_D; + } +} + void dataManager::allocateGeometryDataArray(std::vector geometry_data_array_H) { if (geometry_data_array_D) @@ -278,5 +291,11 @@ void dataManager::cleanup() { } launch_params_H.rng_states = nullptr; + if (depth_exceeded_count_D != nullptr) { + CUDA_CHECK(cudaFree(depth_exceeded_count_D)); + depth_exceeded_count_D = nullptr; + } + launch_params_H.d_depth_exceeded_count = nullptr; + launch_params_H.handle = OptixTraversableHandle{}; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.h index 7922d132..9d59219e 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/data_manager.h @@ -29,6 +29,9 @@ namespace OptixCSP curandState *rng_states_D; size_t rng_states_capacity; + // Device counter: rays terminated by max depth (not absorption) + uint64_t *depth_exceeded_count_D; + float *sun_user_angle_D; float *sun_user_intensity_D; size_t sun_user_capacity; @@ -68,5 +71,8 @@ namespace OptixCSP unsigned long long seed, unsigned int sequence_offset, cudaStream_t stream); + + /// Allocates the depth-exceeded counter on the device if not already allocated. 
+ void ensureDepthExceededCounter(); }; } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 8ed27894..7a3614be 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -43,6 +43,7 @@ SolTraceSystem::SolTraceSystem() m_optical_errors(false), m_n_hit_rays(0), m_n_sun_rays(0), + m_n_depth_exceeded_rays(0), m_include_sun_shape_errors(false), m_timer_setup(), m_timer_trace(), @@ -298,6 +299,7 @@ void SolTraceSystem::run() m_hit_ray_ids.clear(); m_n_hit_rays = 0; m_n_sun_rays = 0; + m_n_depth_exceeded_rays = 0; uint_fast64_t N_ray_hit = 0; uint_fast64_t N_ray_gen = 0; @@ -358,6 +360,13 @@ void SolTraceSystem::run() } m_timer_collect_results.stop(); + // Read back depth-exceeded count for this batch + uint64_t iter_depth_exceeded = 0; + CUDA_CHECK(cudaMemcpy(&iter_depth_exceeded, + data_manager->launch_params_H.d_depth_exceeded_count, + sizeof(uint64_t), cudaMemcpyDeviceToHost)); + m_n_depth_exceeded_rays += iter_depth_exceeded; + N_ray_hit = m_n_hit_rays; N_ray_gen += width; m_n_sun_rays = N_ray_gen; @@ -383,6 +392,11 @@ void SolTraceSystem::run() m_timer_trace.stop(); + if (m_n_depth_exceeded_rays > 0) + std::cout << "[SolTraceSystem] " << m_n_depth_exceeded_rays + << " ray(s) were terminated due to reaching max_depth (" + << static_cast(data_manager->launch_params_H.max_depth) << ").\n"; + if (m_verbose) { const double t_setup = m_timer_setup_buffer.get_time_sec(); @@ -521,6 +535,7 @@ void SolTraceSystem::reset() m_hit_ray_ids.clear(); m_n_hit_rays = 0; m_n_sun_rays = 0; + m_n_depth_exceeded_rays = 0; m_sun = nullptr; m_number_of_rays = 0; @@ -667,6 +682,8 @@ void SolTraceSystem::allocate_device_buffers() data_manager->launch_params_H.sun_dir_seed, 0, m_state.stream); + + data_manager->ensureDepthExceededCounter(); } void 
SolTraceSystem::setup_device_buffer() @@ -676,6 +693,7 @@ void SolTraceSystem::setup_device_buffer() CUDA_CHECK(cudaMemset(data_manager->launch_params_H.hit_buffer, 0, hit_buffer_size)); CUDA_CHECK(cudaMemset(data_manager->launch_params_H.sun_dir_buffer, 0, sun_dir_size)); + CUDA_CHECK(cudaMemset(data_manager->launch_params_H.d_depth_exceeded_count, 0, sizeof(uint64_t))); data_manager->updateLaunchParams(); } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index a0c005ed..0831404a 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -129,6 +129,9 @@ namespace OptixCSP /// Returns the number of rays that hit at least one element. uint_fast64_t get_N_hit_rays() const { return m_n_hit_rays; } + + /// Returns the number of rays terminated by max depth (excludes absorption at max depth). 
+ uint_fast64_t get_N_depth_exceeded_rays() const { return m_n_depth_exceeded_rays; } void set_sun_shape_errors(bool flag) { this->m_include_sun_shape_errors = flag; } /// Enable or disable trimming excess rays at the end of run() so that @@ -145,6 +148,7 @@ namespace OptixCSP uint_fast64_t m_max_number_of_rays; uint_fast64_t m_batch_size = 0; // 0 means auto-size: determine_batch_size() calls automatic_batch_size() uint8_t m_max_ray_depth = DEFAULT_MAX_TRACE_DEPTH; + uint_fast64_t m_n_depth_exceeded_rays = 0; // rays stopped by max depth, not absorption bool m_verbose; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h index 6eeb5420..1c4c87ab 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/Soltrace.h @@ -63,6 +63,7 @@ namespace OptixCSP{ OptixTraversableHandle handle; // int32_t* element_id_buffer; // uint8_t* hit_type_buffer; + uint64_t* d_depth_exceeded_count; // Atomic counter: rays stopped by max depth, not absorption float3 sun_vector; diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu index 9e903f76..004dbf73 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/shaders/materials.cu @@ -285,6 +285,11 @@ extern "C" __global__ void __closesthit__element() prd.depth = params.max_depth; // terminate the ray by setting depth to max depth } } + else if (!absorbed) + { + // Ray hit an element but max depth was reached; count it (absorption at this depth does not count). 
+ atomicAdd(reinterpret_cast(params.d_depth_exceeded_count), 1ULL); + } setPayload(prd); } From 1b53f8a0c489a67641b051ec7d13fe4c25ae800a Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 26 May 2026 14:07:19 -0600 Subject: [PATCH 56/60] Add test for counting depth terminated rays; expose trace terminated ray count through optix runner --- .../optix_runner/optix_runner.hpp | 4 ++ .../optix_runner/max_ray_depth_test.cpp | 59 +++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.hpp b/coretrace/simulation_runner/optix_runner/optix_runner.hpp index 6ef54ad6..98c87c60 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.hpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.hpp @@ -54,6 +54,10 @@ class OptixRunner : public SolTrace::Runner::SimulationRunner void set_max_ray_depth(uint_fast64_t depth); uint8_t get_max_ray_depth() const { return m_sys.get_max_ray_depth(); } + /// Returns the number of rays terminated by reaching max_depth without being absorbed. + /// Valid after run_simulation() completes; resets to 0 at the start of each run. + uint_fast64_t get_N_depth_exceeded_rays() const { return m_sys.get_N_depth_exceeded_rays(); } + /// Enable or disable trimming of excess rays at the end of run() so that /// exactly the requested number of hit rays is returned. Enabled by default. 
void set_trim_excess_rays(bool trim); diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp index 86b657cc..33581ab2 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp @@ -172,3 +172,62 @@ TEST(OptixRunnerMaxRayDepth, TraceDepthNotExceeded) } } } + +// --------------------------------------------------------------------------- +// GPU trace tests: verify the depth-exceeded counter behaves correctly. +// --------------------------------------------------------------------------- + +// With max_depth at the default (5), the two-plate scene (reflect→absorb) uses +// only 2 depth slots per ray — well within the limit. The counter must stay 0. +TEST(OptixRunnerMaxRayDepth, DepthExceededCounterIsZeroWhenDepthNotExceeded) +{ + SimulationData sd; + element_ptr plate1, plate2; + make_two_plate_sd(sd, plate1, plate2); + + OptixRunner runner; + // Leave max_ray_depth at the default (5); the scene only needs depth 2. + RunnerStatus sts = runner.initialize(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.setup_simulation(&sd); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.run_simulation(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + EXPECT_EQ(runner.get_N_depth_exceeded_rays(), 0u); +} + +// With max_depth=2, the two-plate scene where BOTH plates are reflective forces +// every ray to exceed the depth limit on its second hit (plate 2). +// Each ray that hit plate 1 (and reflected toward plate 2) must increment the +// counter exactly once, so the counter must equal the number of hit rays. 
+TEST(OptixRunnerMaxRayDepth, DepthExceededCounterCountsTerminatedReflectedRays) +{ + SimulationData sd; + element_ptr plate1, plate2; + make_two_plate_sd(sd, plate1, plate2); + + // Override plate 2 to be reflective so hitting it at max_depth triggers the counter. + plate2->get_front_optical_properties()->set_ideal_reflection(); + plate2->get_back_optical_properties()->set_ideal_reflection(); + + OptixRunner runner; + runner.set_max_ray_depth(2); // max interactions per ray = 1 (plate 1 only) + ASSERT_EQ(runner.get_max_ray_depth(), 2u); + + RunnerStatus sts = runner.initialize(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.setup_simulation(&sd); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + sts = runner.run_simulation(); + ASSERT_EQ(sts, RunnerStatus::SUCCESS); + + const uint_fast64_t counter = runner.get_N_depth_exceeded_rays(); + const uint_fast64_t n_hit = runner.get_number_rays_traced(); + + // Every ray that hit plate 1 (and was therefore included in hit output) + // also headed toward plate 2, where it would have been cut off. 
+ EXPECT_GT(counter, 0u) << "Expected depth-exceeded counter to be non-zero"; + EXPECT_EQ(counter, n_hit) + << "Each hit ray should have triggered exactly one depth-exceeded event on plate 2"; +} From 5f06f166e97f274b7cc502fa7123a6dde458b6e2 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Tue, 26 May 2026 16:08:40 -0600 Subject: [PATCH 57/60] Attempt to fix occasional post compile optix runner seg fault; relax max trace depth count test --- .../OptixCSP/src/core/pipeline_manager.cpp | 6 +++++ .../OptixCSP/src/core/soltrace_system.cpp | 12 ++++++--- .../OptixCSP/src/core/soltrace_system.h | 12 ++++++--- .../optix_runner/max_ray_depth_test.cpp | 25 ++++++++++--------- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp index bc488eb4..ade2345d 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/pipeline_manager.cpp @@ -118,6 +118,7 @@ void pipelineManager::loadModules() // Geometry module. { std::string ptx = loadPtxFromFile("intersection"); + LOG_SIZE = sizeof(LOG); OPTIX_CHECK(optixModuleCreate( m_state.context, &moduleCompileOptions, @@ -130,6 +131,7 @@ void pipelineManager::loadModules() // Shading/materials module. { std::string ptx = loadPtxFromFile("materials"); + LOG_SIZE = sizeof(LOG); OPTIX_CHECK(optixModuleCreate( m_state.context, &moduleCompileOptions, @@ -142,6 +144,7 @@ void pipelineManager::loadModules() // Sun module. { std::string ptx = loadPtxFromFile("sun"); + LOG_SIZE = sizeof(LOG); OPTIX_CHECK(optixModuleCreate( m_state.context, &moduleCompileOptions, @@ -175,6 +178,7 @@ void pipelineManager::createPipeline() pipeline_link_options.maxTraceDepth = m_max_trace_depth; // Maximum recursion depth for ray tracing. // Create the OptiX pipeline by linking the program groups. 
+ LOG_SIZE = sizeof(LOG); OPTIX_CHECK(optixPipelineCreate( m_state.context, // OptiX context. &m_state.pipeline_compile_options, // Compile options for the pipeline. @@ -235,6 +239,7 @@ void pipelineManager::createHitGroupProgram(OptixProgramGroup &group, desc.hitgroup.moduleAH = nullptr; desc.hitgroup.entryFunctionNameAH = nullptr; + LOG_SIZE = sizeof(LOG); OPTIX_CHECK(optixProgramGroupCreate( m_state.context, &desc, @@ -345,6 +350,7 @@ void pipelineManager::createMissProgram() desc.miss.entryFunctionName = "__miss__ms"; // Create the program grou + LOG_SIZE = sizeof(LOG); OPTIX_CHECK(optixProgramGroupCreate( m_state.context, &desc, diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 7a3614be..d2691ef8 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -149,6 +149,14 @@ void SolTraceSystem::initialize() OPTIX_CHECK(optixDeviceContextCreate(cuCtx, &options, &m_state.context)); } + // Create the CUDA stream immediately after the context so that all subsequent + // GPU work (GAS build, kernel launches, optixLaunch) uses the same named stream. + // Doing this before create_geometries() ensures optixAccelBuild and optixLaunch + // share a single stream, giving explicit serial ordering without relying solely + // on legacy null-stream synchronization semantics. + if (!m_state.stream) + CUDA_CHECK(cudaStreamCreate(&m_state.stream)); + { size_t mem_total; CUDA_CHECK(cudaMemGetInfo(&m_mem_free_before, &mem_total)); @@ -257,10 +265,6 @@ void SolTraceSystem::initialize() data_manager->launch_params_H.sun_dir_seed = m_seed; data_manager->launch_params_H.optical_errors = m_optical_errors; - // Create a CUDA stream for asynchronous operations (once; guard against re-init leak). 
- if (!m_state.stream) - CUDA_CHECK(cudaStreamCreate(&m_state.stream)); - // Link the GAS handle. data_manager->launch_params_H.handle = m_state.gas_handle; data_manager->allocateGeometryDataArray(geometry_manager->get_geometry_data_array()); diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 0831404a..13d205a2 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -140,6 +140,13 @@ namespace OptixCSP bool get_trim_excess_rays() const { return m_trim_excess_rays; } private: + // m_verbose and m_state must be declared before the shared_ptr managers so + // that they are initialized first (C++ initializes members in declaration order). + // GeometryManager and pipelineManager store references/copies of these at + // construction time, so they must be valid when the shared_ptrs are built. 
+ bool m_verbose = false; + OptixCSP::SoltraceState m_state; + std::shared_ptr geometry_manager; std::shared_ptr pipeline_manager; std::shared_ptr data_manager; @@ -150,8 +157,6 @@ namespace OptixCSP uint8_t m_max_ray_depth = DEFAULT_MAX_TRACE_DEPTH; uint_fast64_t m_n_depth_exceeded_rays = 0; // rays stopped by max depth, not absorption - bool m_verbose; - // Sun // OptixCSP::Vec3d m_sun_vector; // double m_sun_angle; @@ -161,8 +166,7 @@ namespace OptixCSP bool m_trim_excess_rays = true; uint64_t m_seed = 123456ULL; - bool m_optical_errors; - OptixCSP::SoltraceState m_state; + bool m_optical_errors = false; // Results diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp index 33581ab2..59702454 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/max_ray_depth_test.cpp @@ -147,10 +147,10 @@ TEST(OptixRunnerMaxRayDepth, TraceDepthNotExceeded) ASSERT_EQ(sts, RunnerStatus::SUCCESS); OptixCSP::SolTraceSystem *sys = runner.get_optix_system(); - std::vector hp_vec; + std::vector hp_vec; std::vector raynumber_vec; - std::vector element_id_vec; - std::vector hit_type_vec; + std::vector element_id_vec; + std::vector hit_type_vec; sys->get_hp_output(hp_vec, raynumber_vec, element_id_vec, hit_type_vec); // Walk through every record. HIT_CREATE marks the start of a new ray; @@ -198,9 +198,10 @@ TEST(OptixRunnerMaxRayDepth, DepthExceededCounterIsZeroWhenDepthNotExceeded) } // With max_depth=2, the two-plate scene where BOTH plates are reflective forces -// every ray to exceed the depth limit on its second hit (plate 2). -// Each ray that hit plate 1 (and reflected toward plate 2) must increment the -// counter exactly once, so the counter must equal the number of hit rays. 
+// rays that reach plate 2 to exceed the depth limit (new_depth=2 >= max_depth=2, +// and plate 2 is non-absorbing). Not all reflections from plate 1 geometrically +// reach plate 2, so counter <= n_hit. The key check is that counter > 0 and +// never exceeds the total number of hit rays. TEST(OptixRunnerMaxRayDepth, DepthExceededCounterCountsTerminatedReflectedRays) { SimulationData sd; @@ -222,12 +223,12 @@ TEST(OptixRunnerMaxRayDepth, DepthExceededCounterCountsTerminatedReflectedRays) sts = runner.run_simulation(); ASSERT_EQ(sts, RunnerStatus::SUCCESS); - const uint_fast64_t counter = runner.get_N_depth_exceeded_rays(); - const uint_fast64_t n_hit = runner.get_number_rays_traced(); + const uint_fast64_t counter = runner.get_N_depth_exceeded_rays(); + const uint_fast64_t n_hit = runner.get_number_rays_traced(); - // Every ray that hit plate 1 (and was therefore included in hit output) - // also headed toward plate 2, where it would have been cut off. + // Every ray that reached plate 2 (a subset of the plate 1 hits — + // some reflections miss plate 2 entirely) must have incremented the counter. 
EXPECT_GT(counter, 0u) << "Expected depth-exceeded counter to be non-zero"; - EXPECT_EQ(counter, n_hit) - << "Each hit ray should have triggered exactly one depth-exceeded event on plate 2"; + EXPECT_LE(counter, n_hit) + << "Counter cannot exceed the total number of hit rays"; } From 5a63384f94fa253c2007f3e44bcb9afdf5fc3cf6 Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 27 May 2026 08:33:45 -0600 Subject: [PATCH 58/60] Address copilot comments --- .../OptixCSP/src/core/ray_utils.cu | 2 +- .../OptixCSP/src/core/ray_utils.h | 2 +- .../OptixCSP/src/core/soltrace_system.cpp | 22 ++++++++++++++++--- .../OptixCSP/src/core/soltrace_system.h | 14 ++++++------ .../optix_runner/optix_runner.cpp | 16 +------------- .../optix_runner/sun_test.cpp | 8 ++++--- 6 files changed, 34 insertions(+), 30 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu index 82baac26..a685e2c2 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.cu @@ -97,7 +97,7 @@ namespace OptixCSP // --------------------------------------------------------------------------- // Host-callable scratch management // --------------------------------------------------------------------------- - void allocate_compaction_scratch(CompactionScratch &scratch, uint32_t num_rays, uint32_t max_depth) + void allocate_compaction_scratch(CompactionScratch &scratch, uint64_t num_rays, uint64_t max_depth) { free_compaction_scratch(scratch); diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h index c0783815..06f945b4 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/ray_utils.h @@ -44,7 +44,7 @@ 
namespace OptixCSP /// Allocate all device scratch buffers for the given ray-buffer dimensions. /// Frees any previous allocation before reallocating. - void allocate_compaction_scratch(CompactionScratch &scratch, uint32_t num_rays, uint32_t max_depth); + void allocate_compaction_scratch(CompactionScratch &scratch, uint64_t num_rays, uint64_t max_depth); /// Free all device scratch buffers and reset the struct to its default state. void free_compaction_scratch(CompactionScratch &scratch); diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index d2691ef8..e8054d6d 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -16,7 +16,6 @@ #include "utils/math_util.h" #include -#include #include #include #include @@ -92,6 +91,23 @@ SolTraceSystem::~SolTraceSystem() clean_up(); } +void SolTraceSystem::set_max_ray_depth(uint64_t depth) +{ + if (depth < 2) + { + std::cerr << "[OptixRunner] WARNING: max_ray_depth (" << depth + << ") is below the minimum of 2. Clamping to 2.\n"; + depth = 2; + } + else if (depth > 255) + { + std::cerr << "[OptixRunner] WARNING: max_ray_depth (" << depth + << ") exceeds the maximum of 255. 
Clamping to 255.\n"; + depth = 255; + } + m_max_ray_depth = depth; +} + void SolTraceSystem::set_verbose(bool verbose) { m_verbose = verbose; @@ -665,8 +681,8 @@ void SolTraceSystem::allocate_device_buffers() m_hit_buffer_size_allocated = hit_buffer_size; // Reallocate compaction scratch whenever ray-buffer dimensions change - const uint32_t num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height; - const uint32_t max_depth = static_cast(data_manager->launch_params_H.max_depth); + const uint64_t num_rays = data_manager->launch_params_H.width * data_manager->launch_params_H.height; + const uint64_t max_depth = static_cast(data_manager->launch_params_H.max_depth); allocate_compaction_scratch(m_compaction_scratch, num_rays, max_depth); } diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h index 13d205a2..0e13fd1a 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.h @@ -86,8 +86,8 @@ namespace OptixCSP uint_fast64_t get_batch_size() const { return m_batch_size; } /// Set the maximum ray interaction depth. Must be called before initialize(). - /// Values are clamped to [1, 255]. Defaults to DEFAULT_MAX_TRACE_DEPTH. - void set_max_ray_depth(uint8_t depth) { m_max_ray_depth = depth; } + /// Values are clamped to [2, 255]. Defaults to DEFAULT_MAX_TRACE_DEPTH = 5. + void set_max_ray_depth(uint64_t depth); uint8_t get_max_ray_depth() const { return m_max_ray_depth; } void set_sun(SolTrace::Data::Sun *sun) { m_sun = sun; } @@ -226,10 +226,10 @@ namespace OptixCSP uint64_t m_n_run_iterations; // memory usage - size_t m_mem_free_before; ///< Free GPU memory at the start of initialize(), before any setup allocations. 
- size_t m_mem_free_post_setup; ///< Free GPU memory at the end of initialize(), after all setup allocations (BVH, - /// pipeline, SBT, geometry/material arrays). Used as the baseline in - /// automatic_batch_size() so batch sizing is stable across run() calls. - size_t m_mem_free_after; ///< Free GPU memory sampled during run() for per-launch memory reporting. + size_t m_mem_free_before; ///< Free GPU memory at the start of initialize(), before any setup allocations. + size_t m_mem_free_post_setup; ///< Free GPU memory at the end of initialize(), after all setup allocations (BVH, + /// pipeline, SBT, geometry/material arrays). Used as the baseline in + /// automatic_batch_size() so batch sizing is stable across run() calls. + size_t m_mem_free_after; ///< Free GPU memory sampled during run() for per-launch memory reporting. }; } diff --git a/coretrace/simulation_runner/optix_runner/optix_runner.cpp b/coretrace/simulation_runner/optix_runner/optix_runner.cpp index e0d341e5..532638a6 100644 --- a/coretrace/simulation_runner/optix_runner/optix_runner.cpp +++ b/coretrace/simulation_runner/optix_runner/optix_runner.cpp @@ -1,6 +1,4 @@ #include "simulation_runner/optix_runner/optix_runner.hpp" -#include "simulation_data/simulation_parameters.hpp" -#include "simulation_data/simulation_data.hpp" #include "simulation_data/simulation_data_export.hpp" #include @@ -33,19 +31,7 @@ void OptixRunner::print_timing() const void OptixRunner::set_max_ray_depth(uint_fast64_t depth) { - if (depth < 2) - { - std::cerr << "[OptixRunner] WARNING: max_ray_depth (" << depth - << ") is below the minimum of 2. Clamping to 2.\n"; - depth = 2; - } - else if (depth > 255) - { - std::cerr << "[OptixRunner] WARNING: max_ray_depth (" << depth - << ") exceeds the maximum of 255. 
Clamping to 255.\n"; - depth = 255; - } - m_sys.set_max_ray_depth(static_cast(depth)); + m_sys.set_max_ray_depth(depth); } void OptixRunner::set_batch_size(uint_fast64_t batch_size) diff --git a/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp b/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp index 96c7af34..5db70cdf 100644 --- a/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp +++ b/google-tests/unit-tests/simulation_runner/optix_runner/sun_test.cpp @@ -825,10 +825,10 @@ TEST(Sun, HaltonWarningWhenExceedingUInt32Max) OptixRunner runner; testing::internal::CaptureStderr(); - runner.setup_simulation(&sd); + ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); const std::string output = testing::internal::GetCapturedStderr(); - EXPECT_NE(output.find("Warning"), std::string::npos) + EXPECT_NE(output.find("Halton"), std::string::npos) << "Expected a warning about Halton index overflow in stderr, but got: " << output; } @@ -851,9 +851,11 @@ TEST(Sun, NoHaltonWarningWithinUInt32Range) OptixRunner runner; testing::internal::CaptureStderr(); - runner.setup_simulation(&sd); + ASSERT_EQ(runner.setup_simulation(&sd), RunnerStatus::SUCCESS); const std::string output = testing::internal::GetCapturedStderr(); + // Checking for "Warning" may result in failure due to other warning. Since + // this should be clean, I consider that to be a good thing. 
EXPECT_EQ(output.find("Warning"), std::string::npos) << "Unexpected Halton warning in stderr: " << output; } From 874f4dd1c1661dc3efecd45d4c59a3c5e0eea2fa Mon Sep 17 00:00:00 2001 From: Jonathan Maack Date: Wed, 27 May 2026 09:05:52 -0600 Subject: [PATCH 59/60] One last copilot comment fix --- .../OptixCSP/src/core/soltrace_system.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index e8054d6d..9d8c1e46 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -203,7 +203,7 @@ void SolTraceSystem::initialize() // Assign sun shape parameters (if necessary) data_manager->launch_params_H.include_sun_shape_errors = this->m_include_sun_shape_errors; - data_manager->allocateSunUserData({}, {}); // Clear sun user data + data_manager->allocateSunUserData({}, {}); // Clear sun user data if (this->m_include_sun_shape_errors) { // Map SolTrace::Data::SunShape to OptixCSP::SunShape for device code @@ -541,9 +541,9 @@ void SolTraceSystem::clean_up() m_state.sbt = {}; m_state.d_gas_output_buffer = 0; - m_mem_free_before = 0; + m_mem_free_before = 0; m_mem_free_post_setup = 0; - m_mem_free_after = 0; + m_mem_free_after = 0; } void SolTraceSystem::reset() @@ -823,7 +823,7 @@ void SolTraceSystem::print_timing() const if (m_mem_free_before > 0) { std::cout << std::fixed << std::setprecision(2); - std::cout << " Free before setup : " << m_mem_free_before * kMB << " MB\n"; + std::cout << " Free before setup : " << m_mem_free_before * kMB << " MB\n"; if (m_mem_free_post_setup > 0) { std::cout << " Free after setup : " << m_mem_free_post_setup * kMB << " MB\n"; @@ -881,10 +881,10 @@ uint_fast64_t SolTraceSystem::automatic_batch_size() const const uint_fast64_t computed = 
(bytes_per_ray > 0) ? static_cast(usable_bytes / bytes_per_ray) : 0u; - // Cap at int max (OptiX launch width is signed int). + // Cap at int max / m_max_ray_depth (OptiX launch width is signed int). uint_fast64_t batch_size = std::min( computed, - static_cast(std::numeric_limits::max())); + static_cast(std::numeric_limits::max() / m_max_ray_depth)); if (m_verbose) { From dac334b87c3c6a5c9d7181dd035550277bde84f2 Mon Sep 17 00:00:00 2001 From: Taylor Brown <60201147+taylorbrown75@users.noreply.github.com> Date: Wed, 27 May 2026 11:18:22 -0600 Subject: [PATCH 60/60] Fix macro conflict on windows --- .../optix_runner/OptixCSP/src/core/soltrace_system.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp index 9d8c1e46..ac1840a8 100644 --- a/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp +++ b/coretrace/simulation_runner/optix_runner/OptixCSP/src/core/soltrace_system.cpp @@ -1,3 +1,7 @@ +#ifndef NOMINMAX +#define NOMINMAX +#endif + #include "soltrace_system.h" #include "ray_utils.h"